1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file defines the interfaces that X86 uses to lower LLVM code into a
11 // selection DAG.
13 //===----------------------------------------------------------------------===//
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86ShuffleDecodeConstantPool.h"
23 #include "X86TargetMachine.h"
24 #include "X86TargetObjectFile.h"
25 #include "llvm/ADT/SmallBitVector.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/Analysis/EHPersonalities.h"
31 #include "llvm/CodeGen/IntrinsicLowering.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
36 #include "llvm/CodeGen/MachineModuleInfo.h"
37 #include "llvm/CodeGen/MachineRegisterInfo.h"
38 #include "llvm/CodeGen/WinEHFuncInfo.h"
39 #include "llvm/IR/CallSite.h"
40 #include "llvm/IR/CallingConv.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/Function.h"
44 #include "llvm/IR/GlobalAlias.h"
45 #include "llvm/IR/GlobalVariable.h"
46 #include "llvm/IR/Instructions.h"
47 #include "llvm/IR/Intrinsics.h"
48 #include "llvm/MC/MCAsmInfo.h"
49 #include "llvm/MC/MCContext.h"
50 #include "llvm/MC/MCExpr.h"
51 #include "llvm/MC/MCSymbol.h"
52 #include "llvm/Support/CommandLine.h"
53 #include "llvm/Support/Debug.h"
54 #include "llvm/Support/ErrorHandling.h"
55 #include "llvm/Support/KnownBits.h"
56 #include "llvm/Support/MathExtras.h"
57 #include "llvm/Target/TargetLowering.h"
58 #include "llvm/Target/TargetOptions.h"
65 #define DEBUG_TYPE "x86-isel"
67 STATISTIC(NumTailCalls, "Number of tail calls");
69 static cl::opt<bool> ExperimentalVectorWideningLegalization(
70 "x86-experimental-vector-widening-legalization", cl::init(false),
71 cl::desc("Enable an experimental vector type legalization through widening "
72 "rather than promotion."),
75 static cl::opt<int> ExperimentalPrefLoopAlignment(
76 "x86-experimental-pref-loop-alignment", cl::init(4),
77 cl::desc("Sets the preferable loop alignment for experiments "
78 "(the last x86-experimental-pref-loop-alignment bits"
79 " of the loop header PC will be 0)."),
82 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
83 const X86Subtarget &STI)
84 : TargetLowering(TM), Subtarget(STI) {
85 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
86 X86ScalarSSEf64 = Subtarget.hasSSE2();
87 X86ScalarSSEf32 = Subtarget.hasSSE1();
88 MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
90 // Set up the TargetLowering object.
92 // X86 is weird. It always uses i8 for shift amounts and setcc results.
93 setBooleanContents(ZeroOrOneBooleanContent);
94 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
95 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
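// Illustrative note (not itself part of the lowering): a scalar setcc thus
// materializes 0 or 1 in an 8-bit register (SETcc), while a vector compare
// such as PCMPEQD yields an all-ones (-1) or all-zeros mask per element.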
97 // For 64-bit, since we have so many registers, use the ILP scheduler.
98 // For 32-bit, use the register pressure specific scheduling.
99 // For Atom, always use ILP scheduling.
100 if (Subtarget.isAtom())
101 setSchedulingPreference(Sched::ILP);
102 else if (Subtarget.is64Bit())
103 setSchedulingPreference(Sched::ILP);
104 else
105 setSchedulingPreference(Sched::RegPressure);
106 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
107 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
109 // Bypass expensive divides and use cheaper ones.
110 if (TM.getOptLevel() >= CodeGenOpt::Default) {
111 if (Subtarget.hasSlowDivide32())
112 addBypassSlowDiv(32, 8);
113 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
114 addBypassSlowDiv(64, 32);
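// Rough sketch of what the bypass does: addBypassSlowDiv(32, 8) makes the
// code generator emit a runtime check so that a 32-bit divide whose operands
// happen to fit in 8 bits is performed with the much cheaper 8-bit divider.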
117 if (Subtarget.isTargetKnownWindowsMSVC() ||
118 Subtarget.isTargetWindowsItanium()) {
119 // Setup Windows compiler runtime calls.
120 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
121 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
122 setLibcallName(RTLIB::SREM_I64, "_allrem");
123 setLibcallName(RTLIB::UREM_I64, "_aullrem");
124 setLibcallName(RTLIB::MUL_I64, "_allmul");
125 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
126 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
127 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
128 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
129 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
132 if (Subtarget.isTargetDarwin()) {
133 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
134 setUseUnderscoreSetJmp(false);
135 setUseUnderscoreLongJmp(false);
136 } else if (Subtarget.isTargetWindowsGNU()) {
137 // MS runtime is weird: it exports _setjmp, but longjmp!
138 setUseUnderscoreSetJmp(true);
139 setUseUnderscoreLongJmp(false);
140 } else {
141 setUseUnderscoreSetJmp(true);
142 setUseUnderscoreLongJmp(true);
145 // Set up the register classes.
146 addRegisterClass(MVT::i8, &X86::GR8RegClass);
147 addRegisterClass(MVT::i16, &X86::GR16RegClass);
148 addRegisterClass(MVT::i32, &X86::GR32RegClass);
149 if (Subtarget.is64Bit())
150 addRegisterClass(MVT::i64, &X86::GR64RegClass);
152 for (MVT VT : MVT::integer_valuetypes())
153 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
155 // We don't accept any truncstore of integer registers.
156 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
157 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
158 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
159 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
160 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
161 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
163 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
165 // SETOEQ and SETUNE require checking two conditions.
166 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
167 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
168 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
169 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
170 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
171 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
173 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
174 // operation.
175 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
176 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
177 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
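// E.g. (sketch): a uint_to_fp from i16 is widened to a signed conversion from
// a larger type; the zero-extended value is always non-negative in the wider
// type, so sint_to_fp produces the same result.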
179 if (Subtarget.is64Bit()) {
180 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
181 // f32/f64 are legal, f80 is custom.
182 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
183 else
184 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
185 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
186 } else if (!Subtarget.useSoftFloat()) {
187 // We have an algorithm for SSE2->double, and we turn this into a
188 // 64-bit FILD followed by conditional FADD for other targets.
189 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
190 // We have an algorithm for SSE2, and we turn this into a 64-bit
191 // FILD or VCVTUSI2SS/SD for other targets.
192 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
195 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
196 // this operation.
197 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
198 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
200 if (!Subtarget.useSoftFloat()) {
201 // SSE has no i16 to fp conversion, only i32.
202 if (X86ScalarSSEf32) {
203 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
204 // f32 and f64 cases are Legal, f80 case is not
205 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
206 } else {
207 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
208 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
209 }
210 } else {
211 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
212 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
215 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
216 // this operation.
217 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
218 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
220 if (!Subtarget.useSoftFloat()) {
221 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
222 // are Legal, f80 is custom lowered.
223 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
224 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
226 if (X86ScalarSSEf32) {
227 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
228 // f32 and f64 cases are Legal, f80 case is not
229 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
230 } else {
231 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
232 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
233 }
234 } else {
235 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
236 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
237 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
240 // Handle FP_TO_UINT by promoting the destination to a larger signed
241 // conversion.
242 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
243 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
244 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
246 if (Subtarget.is64Bit()) {
247 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
248 // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
249 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
250 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
251 } else {
252 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
253 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
255 } else if (!Subtarget.useSoftFloat()) {
256 // Since AVX is a superset of SSE3, only check for SSE here.
257 if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
258 // Expand FP_TO_UINT into a select.
259 // FIXME: We would like to use a Custom expander here eventually to do
260 // the optimal thing for SSE vs. the default expansion in the legalizer.
261 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
262 else
263 // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
264 // With SSE3 we can use fisttpll to convert to a signed i64; without
265 // SSE, we're stuck with a fistpll.
266 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
268 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
271 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
272 if (!X86ScalarSSEf64) {
273 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
274 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
275 if (Subtarget.is64Bit()) {
276 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
277 // Without SSE, i64->f64 goes through memory.
278 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
280 } else if (!Subtarget.is64Bit())
281 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
283 // Scalar integer divide and remainder are lowered to use operations that
284 // produce two results, to match the available instructions. This exposes
285 // the two-result form to trivial CSE, which is able to combine x/y and x%y
286 // into a single instruction.
288 // Scalar integer multiply-high is also lowered to use two-result
289 // operations, to match the available instructions. However, plain multiply
290 // (low) operations are left as Legal, as there are single-result
291 // instructions for this in x86. Using the two-result multiply instructions
292 // when both high and low results are needed must be arranged by dagcombine.
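// A minimal illustration of the CSE benefit described above: for
//   int q = x / y;
//   int r = x % y;
// both expressions lower to the same two-result divide node, so one hardware
// IDIV (quotient in EAX, remainder in EDX) can serve both uses.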
293 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
294 setOperationAction(ISD::MULHS, VT, Expand);
295 setOperationAction(ISD::MULHU, VT, Expand);
296 setOperationAction(ISD::SDIV, VT, Expand);
297 setOperationAction(ISD::UDIV, VT, Expand);
298 setOperationAction(ISD::SREM, VT, Expand);
299 setOperationAction(ISD::UREM, VT, Expand);
302 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
303 if (VT == MVT::i64 && !Subtarget.is64Bit())
304 continue;
305 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
306 setOperationAction(ISD::ADDC, VT, Custom);
307 setOperationAction(ISD::ADDE, VT, Custom);
308 setOperationAction(ISD::SUBC, VT, Custom);
309 setOperationAction(ISD::SUBE, VT, Custom);
312 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
313 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
314 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
315 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
316 setOperationAction(ISD::BR_CC, VT, Expand);
317 setOperationAction(ISD::SELECT_CC, VT, Expand);
319 if (Subtarget.is64Bit())
320 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
321 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
322 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
323 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
324 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
326 setOperationAction(ISD::FREM , MVT::f32 , Expand);
327 setOperationAction(ISD::FREM , MVT::f64 , Expand);
328 setOperationAction(ISD::FREM , MVT::f80 , Expand);
329 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
331 // Promote the i8 variants and force them on up to i32 which has a shorter
332 // encoding.
333 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
334 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
335 if (!Subtarget.hasBMI()) {
336 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
337 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
338 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
339 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
340 if (Subtarget.is64Bit()) {
341 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
342 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
346 if (Subtarget.hasLZCNT()) {
347 // When promoting the i8 variants, force them to i32 for a shorter
348 // encoding.
349 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
350 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
351 } else {
352 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
353 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
354 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
355 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
356 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
357 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
358 if (Subtarget.is64Bit()) {
359 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
360 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
364 // Special handling for half-precision floating point conversions.
365 // If we don't have F16C support, then lower half float conversions
366 // into library calls.
367 if (Subtarget.useSoftFloat() ||
368 (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
369 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
370 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
373 // There's never any support for operations beyond MVT::f32.
374 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
375 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
376 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
377 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
379 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
380 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
381 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
382 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
383 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
384 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
386 if (Subtarget.hasPOPCNT()) {
388 } else {
389 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
389 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
390 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
391 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
392 if (Subtarget.is64Bit())
393 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
396 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
398 if (!Subtarget.hasMOVBE())
399 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
401 // These should be promoted to a larger select which is supported.
402 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
403 // X86 wants to expand cmov itself.
404 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
405 setOperationAction(ISD::SELECT, VT, Custom);
406 setOperationAction(ISD::SETCC, VT, Custom);
408 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
409 if (VT == MVT::i64 && !Subtarget.is64Bit())
410 continue;
411 setOperationAction(ISD::SELECT, VT, Custom);
412 setOperationAction(ISD::SETCC, VT, Custom);
413 setOperationAction(ISD::SETCCE, VT, Custom);
415 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
416 // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
417 // SjLj exception handling but a light-weight setjmp/longjmp replacement to
418 // support continuations, user-level threading, etc. As a result, no other
419 // SjLj exception interfaces are implemented; please don't build your own
420 // exception handling on top of them.
421 // LLVM/Clang supports zero-cost DWARF exception handling.
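// Sketch of the intended use: these nodes back the @llvm.eh.sjlj.setjmp and
// @llvm.eh.sjlj.longjmp intrinsics, giving frontends a cheap buffer-based
// context-switch primitive without the full SjLj exception machinery.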
422 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
423 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
424 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
425 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
426 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
429 for (auto VT : { MVT::i32, MVT::i64 }) {
430 if (VT == MVT::i64 && !Subtarget.is64Bit())
431 continue;
432 setOperationAction(ISD::ConstantPool , VT, Custom);
433 setOperationAction(ISD::JumpTable , VT, Custom);
434 setOperationAction(ISD::GlobalAddress , VT, Custom);
435 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
436 setOperationAction(ISD::ExternalSymbol , VT, Custom);
437 setOperationAction(ISD::BlockAddress , VT, Custom);
440 // 64-bit shl, sra, srl (iff 32-bit x86)
441 for (auto VT : { MVT::i32, MVT::i64 }) {
442 if (VT == MVT::i64 && !Subtarget.is64Bit())
443 continue;
444 setOperationAction(ISD::SHL_PARTS, VT, Custom);
445 setOperationAction(ISD::SRA_PARTS, VT, Custom);
446 setOperationAction(ISD::SRL_PARTS, VT, Custom);
449 if (Subtarget.hasSSE1())
450 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
452 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
454 // Expand certain atomics
455 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
456 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
457 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
458 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
459 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
460 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
461 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
462 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
465 if (Subtarget.hasCmpxchg16b()) {
466 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
469 // FIXME - use subtarget debug flags
470 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
471 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
472 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
473 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
476 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
477 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
479 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
480 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
482 setOperationAction(ISD::TRAP, MVT::Other, Legal);
483 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
485 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
486 setOperationAction(ISD::VASTART , MVT::Other, Custom);
487 setOperationAction(ISD::VAEND , MVT::Other, Expand);
488 bool Is64Bit = Subtarget.is64Bit();
489 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
490 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
492 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
493 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
495 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
497 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
498 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
499 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
501 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
502 // f32 and f64 use SSE.
503 // Set up the FP register classes.
504 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
505 : &X86::FR32RegClass);
506 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
507 : &X86::FR64RegClass);
509 for (auto VT : { MVT::f32, MVT::f64 }) {
510 // Use ANDPD to simulate FABS.
511 setOperationAction(ISD::FABS, VT, Custom);
513 // Use XORP to simulate FNEG.
514 setOperationAction(ISD::FNEG, VT, Custom);
516 // Use ANDPD and ORPD to simulate FCOPYSIGN.
517 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
519 // We don't support sin/cos/fmod
520 setOperationAction(ISD::FSIN , VT, Expand);
521 setOperationAction(ISD::FCOS , VT, Expand);
522 setOperationAction(ISD::FSINCOS, VT, Expand);
525 // Lower this to MOVMSK plus an AND.
526 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
527 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
529 // Expand FP immediates into loads from the stack, except for the special
530 // cases we handle.
531 addLegalFPImmediate(APFloat(+0.0)); // xorpd
532 addLegalFPImmediate(APFloat(+0.0f)); // xorps
533 } else if (UseX87 && X86ScalarSSEf32) {
534 // Use SSE for f32, x87 for f64.
535 // Set up the FP register classes.
536 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
537 : &X86::FR32RegClass);
538 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
540 // Use ANDPS to simulate FABS.
541 setOperationAction(ISD::FABS , MVT::f32, Custom);
543 // Use XORP to simulate FNEG.
544 setOperationAction(ISD::FNEG , MVT::f32, Custom);
546 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
548 // Use ANDPS and ORPS to simulate FCOPYSIGN.
549 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
550 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
552 // We don't support sin/cos/fmod
553 setOperationAction(ISD::FSIN , MVT::f32, Expand);
554 setOperationAction(ISD::FCOS , MVT::f32, Expand);
555 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
557 // Special cases we handle for FP constants.
558 addLegalFPImmediate(APFloat(+0.0f)); // xorps
559 addLegalFPImmediate(APFloat(+0.0)); // FLD0
560 addLegalFPImmediate(APFloat(+1.0)); // FLD1
561 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
562 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
564 if (!TM.Options.UnsafeFPMath) {
565 setOperationAction(ISD::FSIN , MVT::f64, Expand);
566 setOperationAction(ISD::FCOS , MVT::f64, Expand);
567 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
568 }
569 } else if (UseX87) {
570 // f32 and f64 in x87.
571 // Set up the FP register classes.
572 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
573 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
575 for (auto VT : { MVT::f32, MVT::f64 }) {
576 setOperationAction(ISD::UNDEF, VT, Expand);
577 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
579 if (!TM.Options.UnsafeFPMath) {
580 setOperationAction(ISD::FSIN , VT, Expand);
581 setOperationAction(ISD::FCOS , VT, Expand);
582 setOperationAction(ISD::FSINCOS, VT, Expand);
585 addLegalFPImmediate(APFloat(+0.0)); // FLD0
586 addLegalFPImmediate(APFloat(+1.0)); // FLD1
587 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
588 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
589 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
590 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
591 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
592 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
595 // We don't support FMA.
596 setOperationAction(ISD::FMA, MVT::f64, Expand);
597 setOperationAction(ISD::FMA, MVT::f32, Expand);
599 // Long double always uses X87, except f128 in MMX.
600 if (UseX87) {
601 if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
602 addRegisterClass(MVT::f128, &X86::FR128RegClass);
603 ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
604 setOperationAction(ISD::FABS , MVT::f128, Custom);
605 setOperationAction(ISD::FNEG , MVT::f128, Custom);
606 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
609 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
610 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
611 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
613 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
614 addLegalFPImmediate(TmpFlt); // FLD0
615 TmpFlt.changeSign();
616 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
618 bool ignored;
619 APFloat TmpFlt2(+1.0);
620 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
621 &ignored);
622 addLegalFPImmediate(TmpFlt2); // FLD1
623 TmpFlt2.changeSign();
624 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
627 if (!TM.Options.UnsafeFPMath) {
628 setOperationAction(ISD::FSIN , MVT::f80, Expand);
629 setOperationAction(ISD::FCOS , MVT::f80, Expand);
630 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
633 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
634 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
635 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
636 setOperationAction(ISD::FRINT, MVT::f80, Expand);
637 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
638 setOperationAction(ISD::FMA, MVT::f80, Expand);
641 // Always use a library call for pow.
642 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
643 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
644 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
646 setOperationAction(ISD::FLOG, MVT::f80, Expand);
647 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
648 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
649 setOperationAction(ISD::FEXP, MVT::f80, Expand);
650 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
651 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
652 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
654 // Some FP actions are always expanded for vector types.
655 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
656 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
657 setOperationAction(ISD::FSIN, VT, Expand);
658 setOperationAction(ISD::FSINCOS, VT, Expand);
659 setOperationAction(ISD::FCOS, VT, Expand);
660 setOperationAction(ISD::FREM, VT, Expand);
661 setOperationAction(ISD::FPOWI, VT, Expand);
662 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
663 setOperationAction(ISD::FPOW, VT, Expand);
664 setOperationAction(ISD::FLOG, VT, Expand);
665 setOperationAction(ISD::FLOG2, VT, Expand);
666 setOperationAction(ISD::FLOG10, VT, Expand);
667 setOperationAction(ISD::FEXP, VT, Expand);
668 setOperationAction(ISD::FEXP2, VT, Expand);
671 // First set operation action for all vector types to either promote
672 // (for widening) or expand (for scalarization). Then we will selectively
673 // turn on ones that can be effectively codegen'd.
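// (Sketch of the pattern used for the rest of this constructor: start every
// (opcode, VT) pair at the most conservative action, then upgrade individual
// pairs to Custom or Legal inside the subtarget feature checks below.)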
674 for (MVT VT : MVT::vector_valuetypes()) {
675 setOperationAction(ISD::SDIV, VT, Expand);
676 setOperationAction(ISD::UDIV, VT, Expand);
677 setOperationAction(ISD::SREM, VT, Expand);
678 setOperationAction(ISD::UREM, VT, Expand);
679 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
680 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
681 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
682 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
683 setOperationAction(ISD::FMA, VT, Expand);
684 setOperationAction(ISD::FFLOOR, VT, Expand);
685 setOperationAction(ISD::FCEIL, VT, Expand);
686 setOperationAction(ISD::FTRUNC, VT, Expand);
687 setOperationAction(ISD::FRINT, VT, Expand);
688 setOperationAction(ISD::FNEARBYINT, VT, Expand);
689 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
690 setOperationAction(ISD::MULHS, VT, Expand);
691 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
692 setOperationAction(ISD::MULHU, VT, Expand);
693 setOperationAction(ISD::SDIVREM, VT, Expand);
694 setOperationAction(ISD::UDIVREM, VT, Expand);
695 setOperationAction(ISD::CTPOP, VT, Expand);
696 setOperationAction(ISD::CTTZ, VT, Expand);
697 setOperationAction(ISD::CTLZ, VT, Expand);
698 setOperationAction(ISD::ROTL, VT, Expand);
699 setOperationAction(ISD::ROTR, VT, Expand);
700 setOperationAction(ISD::BSWAP, VT, Expand);
701 setOperationAction(ISD::SETCC, VT, Expand);
702 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
703 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
704 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
705 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
706 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
707 setOperationAction(ISD::TRUNCATE, VT, Expand);
708 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
709 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
710 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
711 setOperationAction(ISD::SELECT_CC, VT, Expand);
712 for (MVT InnerVT : MVT::vector_valuetypes()) {
713 setTruncStoreAction(InnerVT, VT, Expand);
715 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
716 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
718 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
719 // types; we have to deal with them whether we ask for Expansion or not.
720 // Setting Expand causes its own optimisation problems though, so leave
721 // it alone.
722 if (VT.getVectorElementType() == MVT::i1)
723 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
725 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
726 // split/scalarized right now.
727 if (VT.getVectorElementType() == MVT::f16)
728 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
732 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
733 // with -msoft-float, disable use of MMX as well.
734 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
735 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
736 // No operations on x86mmx supported, everything uses intrinsics.
739 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
740 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
741 : &X86::VR128RegClass);
743 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
744 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
745 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
746 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
747 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
748 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
749 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
750 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
751 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
754 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
755 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
756 : &X86::VR128RegClass);
758 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
759 // registers cannot be used even for integer operations.
760 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
761 : &X86::VR128RegClass);
762 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
763 : &X86::VR128RegClass);
764 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
765 : &X86::VR128RegClass);
766 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
767 : &X86::VR128RegClass);
769 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
770 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
771 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
772 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
773 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
774 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
775 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
776 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
777 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
778 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
779 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
780 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
781 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
783 setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
784 setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
785 setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
786 setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
788 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
789 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
790 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
792 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
793 setOperationAction(ISD::SETCC, VT, Custom);
794 setOperationAction(ISD::CTPOP, VT, Custom);
795 setOperationAction(ISD::CTTZ, VT, Custom);
798 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
799 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
800 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
801 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
802 setOperationAction(ISD::VSELECT, VT, Custom);
803 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
806 // We support custom legalizing of sext and anyext loads for specific
807 // memory vector types which we can load as a scalar (or sequence of
808 // scalars) and extend in-register to a legal 128-bit vector type. For sext
809 // loads these must work with a single scalar load.
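// For instance (illustrative only): a sextload from v4i8 to v4i32 can be a
// single 32-bit scalar load followed by an in-register sign extension
// (PMOVSXBD with SSE4.1, or an unpack plus arithmetic-shift sequence on SSE2).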
810 for (MVT VT : MVT::integer_vector_valuetypes()) {
811 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
812 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
813 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
814 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
815 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
816 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
817 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
818 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
819 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
822 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
823 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
824 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
825 setOperationAction(ISD::VSELECT, VT, Custom);
827 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
828 continue;
830 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
831 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
834 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
835 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
836 setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
837 setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
838 setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
839 setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
840 setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
843 // Custom lower v2i64 and v2f64 selects.
844 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
845 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
847 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
848 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
850 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
851 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
853 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
854 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
855 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
857 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
858 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
860 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
861 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
863 for (MVT VT : MVT::fp_vector_valuetypes())
864 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
866 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
867 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
868 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
870 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
871 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
872 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
874 // In the customized shift lowering, the legal v4i32/v2i64 cases
875 // in AVX2 will be recognized.
876 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
877 setOperationAction(ISD::SRL, VT, Custom);
878 setOperationAction(ISD::SHL, VT, Custom);
879 setOperationAction(ISD::SRA, VT, Custom);
883 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
884 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
885 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
886 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
887 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
888 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
889 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
890 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
891 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
894 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
895 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
896 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
897 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
898 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
899 setOperationAction(ISD::FRINT, RoundedTy, Legal);
900 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
903 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
904 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
905 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
906 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
907 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
908 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
909 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
910 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
912 // FIXME: Do we need to handle scalar-to-vector here?
913 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
915 // We directly match byte blends in the backend as they match the VSELECT
916 // condition form.
917 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
919 // SSE41 brings specific instructions for doing vector sign extend even in
920 // cases where we don't have SRA.
921 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
922 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
923 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
926 for (MVT VT : MVT::integer_vector_valuetypes()) {
927 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
928 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
929 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
932 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
933 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
934 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
935 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
936 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
937 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
938 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
939 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
942 // i8 vectors are custom because the source register and source memory
943 // operand types are not the same width.
944 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
947 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
948 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
949 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
950 setOperationAction(ISD::ROTL, VT, Custom);
952 // XOP can efficiently perform BITREVERSE with VPPERM.
953 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
954 setOperationAction(ISD::BITREVERSE, VT, Custom);
956 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
957 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
958 setOperationAction(ISD::BITREVERSE, VT, Custom);
961 if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
962 bool HasInt256 = Subtarget.hasInt256();
964 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
965 : &X86::VR256RegClass);
966 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
967 : &X86::VR256RegClass);
968 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
969 : &X86::VR256RegClass);
970 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
971 : &X86::VR256RegClass);
972 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
973 : &X86::VR256RegClass);
974 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
975 : &X86::VR256RegClass);
977 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
978 setOperationAction(ISD::FFLOOR, VT, Legal);
979 setOperationAction(ISD::FCEIL, VT, Legal);
980 setOperationAction(ISD::FTRUNC, VT, Legal);
981 setOperationAction(ISD::FRINT, VT, Legal);
982 setOperationAction(ISD::FNEARBYINT, VT, Legal);
983 setOperationAction(ISD::FNEG, VT, Custom);
984 setOperationAction(ISD::FABS, VT, Custom);
985 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
988 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
989 // even though v8i16 is a legal type.
990 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
991 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
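// In other words (sketch): the v8i16 result is first produced as v8i32
// (e.g. VCVTTPS2DQ on the 256-bit source) and then truncated down to v8i16.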
992 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
994 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
995 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
996 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
998 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
999 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1001 for (MVT VT : MVT::fp_vector_valuetypes())
1002 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1004 // In the customized shift lowering, the legal v8i32/v4i64 cases
1005 // in AVX2 will be recognized.
1006 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1007 setOperationAction(ISD::SRL, VT, Custom);
1008 setOperationAction(ISD::SHL, VT, Custom);
1009 setOperationAction(ISD::SRA, VT, Custom);
1012 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1013 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1014 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1016 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1017 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1018 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1019 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1022 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1023 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1024 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1025 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1027 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1028 setOperationAction(ISD::SETCC, VT, Custom);
1029 setOperationAction(ISD::CTPOP, VT, Custom);
1030 setOperationAction(ISD::CTTZ, VT, Custom);
1031 setOperationAction(ISD::CTLZ, VT, Custom);
1034 if (Subtarget.hasAnyFMA()) {
1035 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1036 MVT::v2f64, MVT::v4f64 })
1037 setOperationAction(ISD::FMA, VT, Legal);
1040 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1041 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1042 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1045 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1046 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1047 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1048 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1050 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1051 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1053 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1054 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1055 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1056 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1058 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1059 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1060 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1061 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1062 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1063 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1064 }
1066 if (HasInt256) {
1067 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
1068 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
1069 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1071 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1072 // when we have a 256bit-wide blend with immediate.
1073 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
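// (A 32-bit unsigned lane can't be handled by a single signed convert; a
// typical approach converts the low and high 16-bit halves separately and
// recombines them, which is where a cheap 256-bit blend with an immediate
// mask pays off. This note is illustrative, not the exact sequence emitted.)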
1075 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1076 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1077 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1078 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1079 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1080 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1081 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1082 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1086 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1087 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1088 setOperationAction(ISD::MLOAD, VT, Legal);
1089 setOperationAction(ISD::MSTORE, VT, Legal);
1092 // Extract subvector is special because the value type
1093 // (result) is 128-bit but the source is 256-bit wide.
1094 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1095 MVT::v4f32, MVT::v2f64 }) {
1096 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1099 // Custom lower several nodes for 256-bit types.
1100 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1101 MVT::v8f32, MVT::v4f64 }) {
1102 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1103 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1104 setOperationAction(ISD::VSELECT, VT, Custom);
1105 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1106 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1107 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1108 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1109 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1112 if (HasInt256)
1113 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1115 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1116 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1117 setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
1118 setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
1119 setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
1120 setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
1121 setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1125 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1126 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1127 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1128 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1129 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1131 addRegisterClass(MVT::i1, &X86::VK1RegClass);
1132 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1133 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1135 for (MVT VT : MVT::fp_vector_valuetypes())
1136 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1138 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1139 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1140 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1141 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1142 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1143 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1144 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1146 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
1147 setOperationAction(ISD::SETCC, MVT::i1, Custom);
1148 setOperationAction(ISD::SETCCE, MVT::i1, Custom);
1149 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
1150 setOperationAction(ISD::XOR, MVT::i1, Legal);
1151 setOperationAction(ISD::OR, MVT::i1, Legal);
1152 setOperationAction(ISD::AND, MVT::i1, Legal);
1153 setOperationAction(ISD::SUB, MVT::i1, Custom);
1154 setOperationAction(ISD::ADD, MVT::i1, Custom);
1155 setOperationAction(ISD::MUL, MVT::i1, Custom);
1157 for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1158 MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1159 MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1160 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1161 setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1162 setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1163 setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
1164 setTruncStoreAction(VT, MaskVT, Custom);
1167 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1168 setOperationAction(ISD::FNEG, VT, Custom);
1169 setOperationAction(ISD::FABS, VT, Custom);
1170 setOperationAction(ISD::FMA, VT, Legal);
1171 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1174 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1175 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1176 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1177 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1178 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1179 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1180 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1181 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1182 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
1183 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
1184 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1185 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1186 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1187 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
1188 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
1189 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1190 setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
1191 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1192 setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
1193 setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
1194 setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
1195 setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
1196 setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
1197 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
1198 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1200 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1201 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1202 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1203 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1204 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1205 if (Subtarget.hasVLX()){
1206 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1207 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1208 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1209 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1210 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1212 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1213 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1214 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1215 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1216 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1218 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1219 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1220 setOperationAction(ISD::MLOAD, VT, Custom);
1221 setOperationAction(ISD::MSTORE, VT, Custom);
1224 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
1225 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1226 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1228 if (Subtarget.hasDQI()) {
1229 for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
1230 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1231 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1232 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
1233 setOperationAction(ISD::FP_TO_UINT, VT, Legal);
1235 if (Subtarget.hasVLX()) {
1236 // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
1237 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1238 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1239 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1242 if (Subtarget.hasVLX()) {
1243 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1244 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1245 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1246 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1247 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1248 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1249 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1250 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
1251 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
1252 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
1253 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
1255 // FIXME: These extending loads are also available on SSE/AVX2; add the relevant patterns.
1256 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1257 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1258 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
1259 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1260 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1261 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1262 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1263 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
1264 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1265 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1268 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1269 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1270 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1271 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1272 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1273 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1274 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1275 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1276 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1277 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1279 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1280 setOperationAction(ISD::FFLOOR, VT, Legal);
1281 setOperationAction(ISD::FCEIL, VT, Legal);
1282 setOperationAction(ISD::FTRUNC, VT, Legal);
1283 setOperationAction(ISD::FRINT, VT, Legal);
1284 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1287 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1288 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1290 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1291 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1292 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1294 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1295 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1296 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1297 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1298 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1300 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1302 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1303 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1304 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1305 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1307 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1309 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1310 setOperationAction(ISD::ABS, MVT::v4i64, Legal);
1311 setOperationAction(ISD::ABS, MVT::v2i64, Legal);
1313 for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
1314 setOperationAction(ISD::ADD, VT, Custom);
1315 setOperationAction(ISD::SUB, VT, Custom);
1316 setOperationAction(ISD::MUL, VT, Custom);
1317 setOperationAction(ISD::SETCC, VT, Custom);
1318 setOperationAction(ISD::SELECT, VT, Custom);
1319 setOperationAction(ISD::TRUNCATE, VT, Custom);
1321 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1322 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1323 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1324 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1325 setOperationAction(ISD::VSELECT, VT, Expand);
1328 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1329 setOperationAction(ISD::SMAX, VT, Legal);
1330 setOperationAction(ISD::UMAX, VT, Legal);
1331 setOperationAction(ISD::SMIN, VT, Legal);
1332 setOperationAction(ISD::UMIN, VT, Legal);
1333 setOperationAction(ISD::ABS, VT, Legal);
1334 setOperationAction(ISD::SRL, VT, Custom);
1335 setOperationAction(ISD::SHL, VT, Custom);
1336 setOperationAction(ISD::SRA, VT, Custom);
1337 setOperationAction(ISD::CTPOP, VT, Custom);
1338 setOperationAction(ISD::CTTZ, VT, Custom);
1341 // Need to promote to 64-bit even though we have 32-bit masked instructions
1342 // because the IR optimizers rearrange bitcasts around logic ops leaving
1343 // too many variations to handle if we don't promote them.
1344 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1345 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1346 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
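// Illustrative sketch (not part of the code above): with this promotion an
// (and v16i32 a, b) is legalized as bitcasts to v8i64 around a v8i64 AND,
//   t0 = bitcast a to v8i64; t1 = bitcast b to v8i64
//   t2 = and t0, t1;        r  = bitcast t2 to v16i32
// so only the 64-bit forms of the masked logic patterns need to be matched.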
1348 if (Subtarget.hasCDI()) {
1349 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit versions.
1350 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
1351 MVT::v4i64, MVT::v8i64}) {
1352 setOperationAction(ISD::CTLZ, VT, Legal);
1353 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1355 } // Subtarget.hasCDI()
1357 if (Subtarget.hasDQI()) {
1358 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit versions.
1359 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1360 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1361 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1364 // Custom lower several nodes.
1365 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1366 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1367 setOperationAction(ISD::MGATHER, VT, Custom);
1368 setOperationAction(ISD::MSCATTER, VT, Custom);
1370 // Extracting a subvector is special because the value type
1371 // (the result) is 256-bit but the source is 512-bit wide.
1372 // The 128-bit cases were already made Custom under AVX1.
1373 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1374 MVT::v8f32, MVT::v4f64 })
1375 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1376 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1377 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1378 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1380 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1381 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1382 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1383 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1384 setOperationAction(ISD::VSELECT, VT, Legal);
1385 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1386 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1387 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1388 setOperationAction(ISD::MLOAD, VT, Legal);
1389 setOperationAction(ISD::MSTORE, VT, Legal);
1390 setOperationAction(ISD::MGATHER, VT, Legal);
1391 setOperationAction(ISD::MSCATTER, VT, Custom);
1393 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1394 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1395 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1399 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1400 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1401 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1403 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1404 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1406 setOperationAction(ISD::ADD, MVT::v32i1, Custom);
1407 setOperationAction(ISD::ADD, MVT::v64i1, Custom);
1408 setOperationAction(ISD::SUB, MVT::v32i1, Custom);
1409 setOperationAction(ISD::SUB, MVT::v64i1, Custom);
1410 setOperationAction(ISD::MUL, MVT::v32i1, Custom);
1411 setOperationAction(ISD::MUL, MVT::v64i1, Custom);
1413 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1414 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1415 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1416 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1417 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1418 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1419 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1420 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1421 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1422 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1423 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1424 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1425 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1426 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1427 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1428 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1429 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1430 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1431 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1432 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1433 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1434 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1435 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1436 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1437 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1438 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1439 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1440 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1441 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1442 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1443 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1444 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1445 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1446 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1447 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1448 setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
1449 setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
1450 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1451 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1452 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1453 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1454 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1455 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1456 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1457 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1458 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1459 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1461 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1463 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1464 if (Subtarget.hasVLX()) {
1465 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1466 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1469 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1470 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1471 setOperationAction(ISD::MLOAD, VT, Action);
1472 setOperationAction(ISD::MSTORE, VT, Action);
1475 if (Subtarget.hasCDI()) {
1476 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1477 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1480 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1481 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1482 setOperationAction(ISD::VSELECT, VT, Legal);
1483 setOperationAction(ISD::ABS, VT, Legal);
1484 setOperationAction(ISD::SRL, VT, Custom);
1485 setOperationAction(ISD::SHL, VT, Custom);
1486 setOperationAction(ISD::SRA, VT, Custom);
1487 setOperationAction(ISD::MLOAD, VT, Legal);
1488 setOperationAction(ISD::MSTORE, VT, Legal);
1489 setOperationAction(ISD::CTPOP, VT, Custom);
1490 setOperationAction(ISD::CTTZ, VT, Custom);
1491 setOperationAction(ISD::SMAX, VT, Legal);
1492 setOperationAction(ISD::UMAX, VT, Legal);
1493 setOperationAction(ISD::SMIN, VT, Legal);
1494 setOperationAction(ISD::UMIN, VT, Legal);
1496 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1497 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1498 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1501 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1502 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1503 if (Subtarget.hasVLX()) {
1504 // FIXME: These instructions are available on SSE/AVX2; add the relevant patterns.
1505 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1506 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
1511 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1512 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1513 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1515 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1516 setOperationAction(ISD::ADD, VT, Custom);
1517 setOperationAction(ISD::SUB, VT, Custom);
1518 setOperationAction(ISD::MUL, VT, Custom);
1519 setOperationAction(ISD::VSELECT, VT, Expand);
1521 setOperationAction(ISD::TRUNCATE, VT, Custom);
1522 setOperationAction(ISD::SETCC, VT, Custom);
1523 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1524 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1525 setOperationAction(ISD::SELECT, VT, Custom);
1526 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1527 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1530 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1531 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1532 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1533 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1535 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1536 setOperationAction(ISD::SMAX, VT, Legal);
1537 setOperationAction(ISD::UMAX, VT, Legal);
1538 setOperationAction(ISD::SMIN, VT, Legal);
1539 setOperationAction(ISD::UMIN, VT, Legal);
1543 // We want to custom lower some of our intrinsics.
1544 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1545 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1546 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1547 if (!Subtarget.is64Bit()) {
1548 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1549 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1552 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1553 // handle type legalization for these operations here.
1555 // FIXME: We really should do custom legalization for addition and
1556 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1557 // than generic legalization for 64-bit multiplication-with-overflow, though.
1558 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1559 if (VT == MVT::i64 && !Subtarget.is64Bit())
1561 // Add/Sub/Mul with overflow operations are custom lowered.
1562 setOperationAction(ISD::SADDO, VT, Custom);
1563 setOperationAction(ISD::UADDO, VT, Custom);
1564 setOperationAction(ISD::SSUBO, VT, Custom);
1565 setOperationAction(ISD::USUBO, VT, Custom);
1566 setOperationAction(ISD::SMULO, VT, Custom);
1567 setOperationAction(ISD::UMULO, VT, Custom);
1569 // Support carry in as value rather than glue.
1570 setOperationAction(ISD::ADDCARRY, VT, Custom);
1571 setOperationAction(ISD::SUBCARRY, VT, Custom);
1574 if (!Subtarget.is64Bit()) {
1575 // These libcalls are not available in 32-bit mode.
1576 setLibcallName(RTLIB::SHL_I128, nullptr);
1577 setLibcallName(RTLIB::SRL_I128, nullptr);
1578 setLibcallName(RTLIB::SRA_I128, nullptr);
1581 // Combine sin / cos into one node or libcall if possible.
1582 if (Subtarget.hasSinCos()) {
1583 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1584 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1585 if (Subtarget.isTargetDarwin()) {
1586 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1587 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1588 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1589 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1593 if (Subtarget.isTargetWin64()) {
1594 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1595 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1596 setOperationAction(ISD::SREM, MVT::i128, Custom);
1597 setOperationAction(ISD::UREM, MVT::i128, Custom);
1598 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1599 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1602 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` is.
1603 // We should promote the value to 64 bits to solve this.
1604 // This is what the CRT headers do - `fmodf` is an inline header
1605 // function casting to f64 and calling `fmod`.
1606 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1607 Subtarget.isTargetWindowsItanium()))
1608 for (ISD::NodeType Op :
1609 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1610 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1611 if (isOperationExpand(Op, MVT::f32))
1612 setOperationAction(Op, MVT::f32, Promote);
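// Illustrative sketch (assumed behaviour of Promote here, not lifted from the
// surrounding code): promoting one of these f32 libcalls performs the
// operation at f64 and rounds the result back, roughly
//   float r = (float)fmod((double)x, (double)y);   // what Promote amounts to for FREM
// which mirrors the inline `fmodf` wrapper in the MSVC CRT headers.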
1614 // We have target-specific dag combine patterns for the following nodes:
1615 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1616 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1617 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1618 setTargetDAGCombine(ISD::BITCAST);
1619 setTargetDAGCombine(ISD::VSELECT);
1620 setTargetDAGCombine(ISD::SELECT);
1621 setTargetDAGCombine(ISD::SHL);
1622 setTargetDAGCombine(ISD::SRA);
1623 setTargetDAGCombine(ISD::SRL);
1624 setTargetDAGCombine(ISD::OR);
1625 setTargetDAGCombine(ISD::AND);
1626 setTargetDAGCombine(ISD::ADD);
1627 setTargetDAGCombine(ISD::FADD);
1628 setTargetDAGCombine(ISD::FSUB);
1629 setTargetDAGCombine(ISD::FNEG);
1630 setTargetDAGCombine(ISD::FMA);
1631 setTargetDAGCombine(ISD::FMINNUM);
1632 setTargetDAGCombine(ISD::FMAXNUM);
1633 setTargetDAGCombine(ISD::SUB);
1634 setTargetDAGCombine(ISD::LOAD);
1635 setTargetDAGCombine(ISD::MLOAD);
1636 setTargetDAGCombine(ISD::STORE);
1637 setTargetDAGCombine(ISD::MSTORE);
1638 setTargetDAGCombine(ISD::TRUNCATE);
1639 setTargetDAGCombine(ISD::ZERO_EXTEND);
1640 setTargetDAGCombine(ISD::ANY_EXTEND);
1641 setTargetDAGCombine(ISD::SIGN_EXTEND);
1642 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1643 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1644 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1645 setTargetDAGCombine(ISD::SINT_TO_FP);
1646 setTargetDAGCombine(ISD::UINT_TO_FP);
1647 setTargetDAGCombine(ISD::SETCC);
1648 setTargetDAGCombine(ISD::MUL);
1649 setTargetDAGCombine(ISD::XOR);
1650 setTargetDAGCombine(ISD::MSCATTER);
1651 setTargetDAGCombine(ISD::MGATHER);
1653 computeRegisterProperties(Subtarget.getRegisterInfo());
1655 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1656 MaxStoresPerMemsetOptSize = 8;
1657 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1658 MaxStoresPerMemcpyOptSize = 4;
1659 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1660 MaxStoresPerMemmoveOptSize = 4;
1661 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1662 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
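// For example: with the default value of 4, loops are preferentially aligned
// to 2^4 = 16 bytes.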
1664 // An out-of-order CPU can speculatively execute past a predictable branch,
1665 // but a conditional move could be stalled by an expensive earlier operation.
1666 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1667 EnableExtLdPromotion = true;
1668 setPrefFunctionAlignment(4); // 2^4 bytes.
1670 verifyIntrinsicTables();
1673 // This has so far only been implemented for 64-bit MachO.
1674 bool X86TargetLowering::useLoadStackGuardNode() const {
1675 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1678 TargetLoweringBase::LegalizeTypeAction
1679 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1680 if (ExperimentalVectorWideningLegalization &&
1681 VT.getVectorNumElements() != 1 &&
1682 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1683 return TypeWidenVector;
1685 return TargetLoweringBase::getPreferredVectorAction(VT);
1688 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1689 LLVMContext& Context,
1692 return Subtarget.hasAVX512() ? MVT::i1 : MVT::i8;
1694 if (VT.isSimple()) {
1695 MVT VVT = VT.getSimpleVT();
1696 const unsigned NumElts = VVT.getVectorNumElements();
1697 MVT EltVT = VVT.getVectorElementType();
1698 if (VVT.is512BitVector()) {
1699 if (Subtarget.hasAVX512())
1700 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1701 EltVT == MVT::f32 || EltVT == MVT::f64)
1703 case 8: return MVT::v8i1;
1704 case 16: return MVT::v16i1;
1706 if (Subtarget.hasBWI())
1707 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1709 case 32: return MVT::v32i1;
1710 case 64: return MVT::v64i1;
1714 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1715 return MVT::getVectorVT(MVT::i1, NumElts);
1717 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1718 EVT LegalVT = getTypeToTransformTo(Context, VT);
1719 EltVT = LegalVT.getVectorElementType().getSimpleVT();
1722 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1724 case 2: return MVT::v2i1;
1725 case 4: return MVT::v4i1;
1726 case 8: return MVT::v8i1;
1730 return VT.changeVectorElementTypeToInteger();
1733 /// Helper for getByValTypeAlignment to determine
1734 /// the desired ByVal argument alignment.
1735 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1738 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1739 if (VTy->getBitWidth() == 128)
1741 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1742 unsigned EltAlign = 0;
1743 getMaxByValAlign(ATy->getElementType(), EltAlign);
1744 if (EltAlign > MaxAlign)
1745 MaxAlign = EltAlign;
1746 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1747 for (auto *EltTy : STy->elements()) {
1748 unsigned EltAlign = 0;
1749 getMaxByValAlign(EltTy, EltAlign);
1750 if (EltAlign > MaxAlign)
1751 MaxAlign = EltAlign;
1758 /// Return the desired alignment for ByVal aggregate
1759 /// function arguments in the caller parameter area. For X86, aggregates
1760 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1761 /// are at 4-byte boundaries.
1762 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1763 const DataLayout &DL) const {
1764 if (Subtarget.is64Bit()) {
1765 // Max of 8 and alignment of type.
1766 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1773 if (Subtarget.hasSSE1())
1774 getMaxByValAlign(Ty, Align);
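// Illustrative example (assuming a 32-bit target with SSE): a byval struct
// containing a <4 x float> member is placed at a 16-byte boundary by the logic
// above, while a struct of plain i32 fields keeps the default 4-byte boundary.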
1778 /// Returns the target-specific optimal type for load
1779 /// and store operations as a result of memset, memcpy, and memmove
1780 /// lowering. If DstAlign is zero, it is safe to assume the destination
1781 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero it
1782 /// means there isn't a need to check it against the alignment requirement,
1783 /// probably because the source does not need to be loaded. If 'IsMemset' is
1784 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1785 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1786 /// source is constant so it does not need to be loaded.
1787 /// It returns EVT::Other if the type should be determined using generic
1788 /// target-independent logic.
1790 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1791 unsigned DstAlign, unsigned SrcAlign,
1792 bool IsMemset, bool ZeroMemset,
1794 MachineFunction &MF) const {
1795 const Function *F = MF.getFunction();
1796 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1798 (!Subtarget.isUnalignedMem16Slow() ||
1799 ((DstAlign == 0 || DstAlign >= 16) &&
1800 (SrcAlign == 0 || SrcAlign >= 16)))) {
1801 // FIXME: Check if unaligned 32-byte accesses are slow.
1802 if (Size >= 32 && Subtarget.hasAVX()) {
1803 // Although this isn't a well-supported type for AVX1, we'll let
1804 // legalization and shuffle lowering produce the optimal codegen. If we
1805 // choose an optimal type with a vector element larger than a byte,
1806 // getMemsetStores() may create an intermediate splat (using an integer
1807 // multiply) before we splat as a vector.
1810 if (Subtarget.hasSSE2())
1812 // TODO: Can SSE1 handle a byte vector?
1813 if (Subtarget.hasSSE1())
1815 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1816 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1817 // Do not use f64 to lower memcpy if source is string constant. It's
1818 // better to use i32 to avoid the loads.
1819 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1820 // The gymnastics of splatting a byte value into an XMM register and then
1821 // only using 8-byte stores (because this is a CPU with slow unaligned
1822 // 16-byte accesses) makes that a loser.
1826 // This is a compromise. If we reach here, unaligned accesses may be slow on
1827 // this target. However, creating smaller, aligned accesses could be even
1828 // slower and would certainly be a lot more code.
1829 if (Subtarget.is64Bit() && Size >= 8)
1834 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1836 return X86ScalarSSEf32;
1837 else if (VT == MVT::f64)
1838 return X86ScalarSSEf64;
1843 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1848 switch (VT.getSizeInBits()) {
1850 // 8-byte and under are always assumed to be fast.
1854 *Fast = !Subtarget.isUnalignedMem16Slow();
1857 *Fast = !Subtarget.isUnalignedMem32Slow();
1859 // TODO: What about AVX-512 (512-bit) accesses?
1862 // Misaligned accesses of any size are always allowed.
1866 /// Return the entry encoding for a jump table in the
1867 /// current function. The returned value is a member of the
1868 /// MachineJumpTableInfo::JTEntryKind enum.
1869 unsigned X86TargetLowering::getJumpTableEncoding() const {
1870 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF symbol.
1872 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1873 return MachineJumpTableInfo::EK_Custom32;
1875 // Otherwise, use the normal jump table encoding heuristics.
1876 return TargetLowering::getJumpTableEncoding();
1879 bool X86TargetLowering::useSoftFloat() const {
1880 return Subtarget.useSoftFloat();
1883 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1884 ArgListTy &Args) const {
1886 // Only relabel X86-32 for C / Stdcall CCs.
1887 if (Subtarget.is64Bit())
1889 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1891 unsigned ParamRegs = 0;
1892 if (auto *M = MF->getFunction()->getParent())
1893 ParamRegs = M->getNumberRegisterParameters();
1895 // Mark the first N integer arguments as being passed in registers.
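// Illustrative example (the numbers are assumed, not taken from this file):
// with two parameter registers available, a call f(i32 %a, i64 %b, i32 %c)
// marks %a inreg (one register remains), then stops at %b because it would
// need two registers; %b and %c stay on the stack.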
1896 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1897 Type *T = Args[Idx].Ty;
1898 if (T->isPointerTy() || T->isIntegerTy())
1899 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1900 unsigned numRegs = 1;
1901 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1903 if (ParamRegs < numRegs)
1905 ParamRegs -= numRegs;
1906 Args[Idx].IsInReg = true;
1912 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1913 const MachineBasicBlock *MBB,
1914 unsigned uid, MCContext &Ctx) const {
1915 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1916 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF entries.
1918 return MCSymbolRefExpr::create(MBB->getSymbol(),
1919 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1922 /// Returns relocation base for the given PIC jumptable.
1923 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1924 SelectionDAG &DAG) const {
1925 if (!Subtarget.is64Bit())
1926 // This doesn't have an SDLoc associated with it, but it is not really the
1927 // same as a Register.
1928 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1929 getPointerTy(DAG.getDataLayout()));
1933 /// This returns the relocation base for the given PIC jumptable,
1934 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1935 const MCExpr *X86TargetLowering::
1936 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1937 MCContext &Ctx) const {
1938 // X86-64 uses RIP relative addressing based on the jump table label.
1939 if (Subtarget.isPICStyleRIPRel())
1940 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1942 // Otherwise, the reference is relative to the PIC base.
1943 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1946 std::pair<const TargetRegisterClass *, uint8_t>
1947 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1949 const TargetRegisterClass *RRC = nullptr;
1951 switch (VT.SimpleTy) {
1953 return TargetLowering::findRepresentativeClass(TRI, VT);
1954 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1955 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1958 RRC = &X86::VR64RegClass;
1960 case MVT::f32: case MVT::f64:
1961 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1962 case MVT::v4f32: case MVT::v2f64:
1963 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1964 case MVT::v8f32: case MVT::v4f64:
1965 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1966 case MVT::v16f32: case MVT::v8f64:
1967 RRC = &X86::VR128XRegClass;
1970 return std::make_pair(RRC, Cost);
1973 unsigned X86TargetLowering::getAddressSpace() const {
1974 if (Subtarget.is64Bit())
1975 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
1979 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
1980 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
1981 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
1984 static Constant* SegmentOffset(IRBuilder<> &IRB,
1985 unsigned Offset, unsigned AddressSpace) {
1986 return ConstantExpr::getIntToPtr(
1987 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
1988 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
1991 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
1992 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
1993 // tcbhead_t; use it instead of the usual global variable (see
1994 // sysdeps/{i386,x86_64}/nptl/tls.h)
1995 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
1996 if (Subtarget.isTargetFuchsia()) {
1997 // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
1998 return SegmentOffset(IRB, 0x10, getAddressSpace());
2000 // %fs:0x28, unless we're using a Kernel code model, in which case
2001 // it's %gs:0x28. gs:0x14 on i386.
2002 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2003 return SegmentOffset(IRB, Offset, getAddressSpace());
2007 return TargetLowering::getIRStackGuard(IRB);
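// A rough sketch of what the TLS fast path above yields for 64-bit Linux
// (address space per getAddressSpace(), offset as in the comments above):
//   %gvptr = inttoptr (i32 40 to i8* addrspace(257)*)   ; %fs:0x28
//   %guard = load i8*, i8* addrspace(257)* %gvptr
// i.e. the guard value is read directly from tcbhead_t rather than a global.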
2010 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2011 // The MSVC CRT provides functionality for stack protection.
2012 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2013 // MSVC CRT has a global variable holding security cookie.
2014 M.getOrInsertGlobal("__security_cookie",
2015 Type::getInt8PtrTy(M.getContext()));
2017 // MSVC CRT has a function to validate security cookie.
2018 auto *SecurityCheckCookie = cast<Function>(
2019 M.getOrInsertFunction("__security_check_cookie",
2020 Type::getVoidTy(M.getContext()),
2021 Type::getInt8PtrTy(M.getContext())));
2022 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2023 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2026 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2027 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2029 TargetLowering::insertSSPDeclarations(M);
2032 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2033 // MSVC CRT has a global variable holding security cookie.
2034 if (Subtarget.getTargetTriple().isOSMSVCRT())
2035 return M.getGlobalVariable("__security_cookie");
2036 return TargetLowering::getSDagStackGuard(M);
2039 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2040 // MSVC CRT has a function to validate security cookie.
2041 if (Subtarget.getTargetTriple().isOSMSVCRT())
2042 return M.getFunction("__security_check_cookie");
2043 return TargetLowering::getSSPStackGuardCheck(M);
2046 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2047 if (Subtarget.getTargetTriple().isOSContiki())
2048 return getDefaultSafeStackPointerLocation(IRB, false);
2050 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2051 // definition of TLS_SLOT_SAFESTACK in
2052 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2053 if (Subtarget.isTargetAndroid()) {
2054 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:0x48.
2056 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2057 return SegmentOffset(IRB, Offset, getAddressSpace());
2060 // Fuchsia is similar.
2061 if (Subtarget.isTargetFuchsia()) {
2062 // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
2063 return SegmentOffset(IRB, 0x18, getAddressSpace());
2066 return TargetLowering::getSafeStackPointerLocation(IRB);
2069 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2070 unsigned DestAS) const {
2071 assert(SrcAS != DestAS && "Expected different address spaces!");
2073 return SrcAS < 256 && DestAS < 256;
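// For example: a cast between address spaces 0 and 1 is a no-op, while any
// cast involving 256 (%gs), 257 (%fs) or 258 (%ss) is not, because those
// address spaces are reached through a segment override.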
2076 //===----------------------------------------------------------------------===//
2077 // Return Value Calling Convention Implementation
2078 //===----------------------------------------------------------------------===//
2080 #include "X86GenCallingConv.inc"
2082 bool X86TargetLowering::CanLowerReturn(
2083 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2084 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2085 SmallVector<CCValAssign, 16> RVLocs;
2086 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2087 return CCInfo.CheckReturn(Outs, RetCC_X86);
2090 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2091 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2095 /// Lowers mask values (v*i1) to the local register values.
2096 /// \returns the DAG node after lowering to the register type.
2097 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2098 const SDLoc &Dl, SelectionDAG &DAG) {
2099 EVT ValVT = ValArg.getValueType();
2101 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2102 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2103 // Two-stage lowering might be required:
2104 // bitcast: v8i1 -> i8 / v16i1 -> i16
2105 // anyextend: i8 -> i32 / i16 -> i32
2106 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2107 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2108 if (ValLoc == MVT::i32)
2109 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2111 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2112 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2113 // One-stage lowering is required:
2114 // bitcast: v32i1 -> i32 / v64i1 -> i64
2115 return DAG.getBitcast(ValLoc, ValArg);
2117 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
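// For example (a sketch of the two-stage path above): a v8i1 mask being
// returned in an i32 location becomes
//   t0 = bitcast v8i1 ValArg to i8
//   t1 = any_extend t0 to i32
// while a v32i1 mask returned in i32 is just a single bitcast.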
2120 /// Breaks a v64i1 value into two registers and adds the new nodes to the DAG.
2121 static void Passv64i1ArgInRegs(
2122 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2123 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2124 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2125 assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2126 "Expected AVX512BW or AVX512BMI target!");
2127 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2128 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2129 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2130 "The value should reside in two registers");
2132 // Before splitting the value, we bitcast it to i64.
2133 Arg = DAG.getBitcast(MVT::i64, Arg);
2135 // Split the value into two i32 halves.
2137 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2138 DAG.getConstant(0, Dl, MVT::i32));
2139 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2140 DAG.getConstant(1, Dl, MVT::i32));
2142 // Attach the two i32 halves to the corresponding registers.
2143 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2144 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
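// For example (sketch): a v64i1 argument on a 32-bit target is bitcast to i64
// and split so that bits [31:0] travel in VA's register and bits [63:32] in
// NextVA's register; getv64i1Argument below performs the inverse on the
// receiving side.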
2148 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2150 const SmallVectorImpl<ISD::OutputArg> &Outs,
2151 const SmallVectorImpl<SDValue> &OutVals,
2152 const SDLoc &dl, SelectionDAG &DAG) const {
2153 MachineFunction &MF = DAG.getMachineFunction();
2154 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2156 // In some cases we need to disable registers from the default CSR list.
2157 // For example, when they are used for argument passing.
2158 bool ShouldDisableCalleeSavedRegister =
2159 CallConv == CallingConv::X86_RegCall ||
2160 MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
2162 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2163 report_fatal_error("X86 interrupts may not return any value");
2165 SmallVector<CCValAssign, 16> RVLocs;
2166 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2167 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2170 SmallVector<SDValue, 6> RetOps;
2171 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2172 // Operand #1 = Bytes To Pop
2173 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2176 // Copy the result values into the output registers.
2177 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2179 CCValAssign &VA = RVLocs[I];
2180 assert(VA.isRegLoc() && "Can only return in registers!");
2182 // Add the register to the CalleeSaveDisableRegs list.
2183 if (ShouldDisableCalleeSavedRegister)
2184 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2186 SDValue ValToCopy = OutVals[OutsIndex];
2187 EVT ValVT = ValToCopy.getValueType();
2189 // Promote values to the appropriate types.
2190 if (VA.getLocInfo() == CCValAssign::SExt)
2191 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2192 else if (VA.getLocInfo() == CCValAssign::ZExt)
2193 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2194 else if (VA.getLocInfo() == CCValAssign::AExt) {
2195 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2196 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2198 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2200 else if (VA.getLocInfo() == CCValAssign::BCvt)
2201 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2203 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2204 "Unexpected FP-extend for return value.");
2206 // If this is x86-64, and we disabled SSE, we can't return FP values,
2207 // or SSE or MMX vectors.
2208 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2209 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2210 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2211 report_fatal_error("SSE register return with SSE disabled");
2213 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2214 // llvm-gcc has never done it right and no one has noticed, so this
2215 // should be OK for now.
2216 if (ValVT == MVT::f64 &&
2217 (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2218 report_fatal_error("SSE2 register return with SSE2 disabled");
2220 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2221 // the RET instruction and handled by the FP Stackifier.
2222 if (VA.getLocReg() == X86::FP0 ||
2223 VA.getLocReg() == X86::FP1) {
2224 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2225 // change the value to the FP stack register class.
2226 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2227 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2228 RetOps.push_back(ValToCopy);
2229 // Don't emit a copytoreg.
2233 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2234 // which is returned in RAX / RDX.
2235 if (Subtarget.is64Bit()) {
2236 if (ValVT == MVT::x86mmx) {
2237 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2238 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2239 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2241 // If we don't have SSE2 available, convert to v4f32 so the generated
2242 // register is legal.
2243 if (!Subtarget.hasSSE2())
2244 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2249 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2251 if (VA.needsCustom()) {
2252 assert(VA.getValVT() == MVT::v64i1 &&
2253 "Currently the only custom case is when we split v64i1 to 2 regs");
2255 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2258 assert(2 == RegsToPass.size() &&
2259 "Expecting two registers after Pass64BitArgInRegs");
2261 // Add the second register to the CalleeSaveDisableRegs list.
2262 if (ShouldDisableCalleeSavedRegister)
2263 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2265 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2268 // Add nodes to the DAG and add the values into the RetOps list
2269 for (auto &Reg : RegsToPass) {
2270 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2271 Flag = Chain.getValue(1);
2272 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2276 // Swift calling convention does not require we copy the sret argument
2277 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2279 // All x86 ABIs require that for returning structs by value we copy
2280 // the sret argument into %rax/%eax (depending on ABI) for the return.
2281 // We saved the argument into a virtual register in the entry block,
2282 // so now we copy the value out and into %rax/%eax.
2284 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2285 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2286 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2287 // either case FuncInfo->setSRetReturnReg() will have been called.
2288 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2289 // When we have both sret and another return value, we should use the
2290 // original Chain stored in RetOps[0], instead of the current Chain updated
2291 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2293 // For the case of sret and another return value, we have
2294 // Chain_0 at the function entry
2295 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2296 // If we use Chain_1 in getCopyFromReg, we will have
2297 // Val = getCopyFromReg(Chain_1)
2298 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2300 // getCopyToReg(Chain_0) will be glued together with
2301 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2302 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2303 // Data dependency from Unit B to Unit A due to usage of Val in
2304 // getCopyToReg(Chain_1, Val)
2305 // Chain dependency from Unit A to Unit B
2307 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2308 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2309 getPointerTy(MF.getDataLayout()));
2312 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2313 X86::RAX : X86::EAX;
2314 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2315 Flag = Chain.getValue(1);
2317 // RAX/EAX now acts like a return value.
2319 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2321 // Add the returned register to the CalleeSaveDisableRegs list.
2322 if (ShouldDisableCalleeSavedRegister)
2323 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2326 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2327 const MCPhysReg *I =
2328 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2331 if (X86::GR64RegClass.contains(*I))
2332 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2334 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2338 RetOps[0] = Chain; // Update chain.
2340 // Add the flag if we have it.
2342 RetOps.push_back(Flag);
2344 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2345 if (CallConv == CallingConv::X86_INTR)
2346 opcode = X86ISD::IRET;
2347 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2350 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2351 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2354 SDValue TCChain = Chain;
2355 SDNode *Copy = *N->use_begin();
2356 if (Copy->getOpcode() == ISD::CopyToReg) {
2357 // If the copy has a glue operand, we conservatively assume it isn't safe to
2358 // perform a tail call.
2359 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2361 TCChain = Copy->getOperand(0);
2362 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2365 bool HasRet = false;
2366 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2368 if (UI->getOpcode() != X86ISD::RET_FLAG)
2370 // If we are returning more than one value, we can definitely
2371 // not make a tail call; see PR19530.
2372 if (UI->getNumOperands() > 4)
2374 if (UI->getNumOperands() == 4 &&
2375 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2387 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2388 ISD::NodeType ExtendKind) const {
2389 MVT ReturnMVT = MVT::i32;
2391 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2392 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2393 // The ABI does not require i1, i8 or i16 to be extended.
2395 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2396 // always extending i8/i16 return values, so keep doing that for now.
2398 ReturnMVT = MVT::i8;
2401 EVT MinVT = getRegisterType(Context, ReturnMVT);
2402 return VT.bitsLT(MinVT) ? MinVT : VT;
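// For example: an i16 return value stays i16 on non-Darwin targets, but on
// Darwin it is widened to i32 to match the historical Clang behaviour
// described above.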
2405 /// Reads two 32 bit registers and creates a 64 bit mask value.
2406 /// \param VA The current 32 bit value that needs to be assigned.
2407 /// \param NextVA The next 32 bit value that needs to be assigned.
2408 /// \param Root The parent DAG node.
2409 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2410 /// glue purposes. In case the DAG is already using a
2411 /// physical register instead of a virtual one, we should glue
2412 /// our new SDValue to the InFlag SDValue.
2413 /// \return a new SDValue that is 64 bits wide.
2414 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2415 SDValue &Root, SelectionDAG &DAG,
2416 const SDLoc &Dl, const X86Subtarget &Subtarget,
2417 SDValue *InFlag = nullptr) {
2418 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2419 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2420 assert(VA.getValVT() == MVT::v64i1 &&
2421 "Expecting first location of 64 bit width type");
2422 assert(NextVA.getValVT() == VA.getValVT() &&
2423 "The locations should have the same type");
2424 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2425 "The values should reside in two registers");
2429 SDValue ArgValueLo, ArgValueHi;
2431 MachineFunction &MF = DAG.getMachineFunction();
2432 const TargetRegisterClass *RC = &X86::GR32RegClass;
2434 // Read a 32 bit value from the registers
2435 if (nullptr == InFlag) {
2436 // When no physical register is present,
2437 // create an intermediate virtual register
2438 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2439 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2440 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2441 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2443 // When a physical register is available, read the value from it and glue
2444 // the reads together.
2446 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2447 *InFlag = ArgValueLo.getValue(2);
2449 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2450 *InFlag = ArgValueHi.getValue(2);
2453 // Convert the i32 type into v32i1 type
2454 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2456 // Convert the i32 type into v32i1 type
2457 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2459 // Concatenate the two values together
2460 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2463 /// The function will lower a register of various sizes (8/16/32/64)
2464 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2465 /// \returns a DAG node containing the operand after lowering to a mask type.
2466 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2467 const EVT &ValLoc, const SDLoc &Dl,
2468 SelectionDAG &DAG) {
2469 SDValue ValReturned = ValArg;
2471 if (ValVT == MVT::v64i1) {
2472 // On a 32-bit machine, this case is handled by getv64i1Argument.
2473 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2474 // On a 64-bit machine there is no need to truncate the value, only bitcast it.
2477 switch (ValVT.getSimpleVT().SimpleTy) {
2488 llvm_unreachable("Expecting a vector of i1 types");
2491 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2494 return DAG.getBitcast(ValVT, ValReturned);
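// For example (sketch): a v16i1 value that was returned in an i32 location is
// first truncated to i16 and then bitcast to v16i1; a v64i1 value returned in
// an i64 location skips the truncate and is bitcast directly.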
2497 /// Lower the result values of a call into the
2498 /// appropriate copies out of appropriate physical registers.
2500 SDValue X86TargetLowering::LowerCallResult(
2501 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2502 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2503 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2504 uint32_t *RegMask) const {
2506 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2507 // Assign locations to each value returned by this call.
2508 SmallVector<CCValAssign, 16> RVLocs;
2509 bool Is64Bit = Subtarget.is64Bit();
2510 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2512 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2514 // Copy all of the result registers out of their specified physreg.
2515 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2517 CCValAssign &VA = RVLocs[I];
2518 EVT CopyVT = VA.getLocVT();
2520 // In some calling conventions we need to remove the used registers
2521 // from the register mask.
2523 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2524 SubRegs.isValid(); ++SubRegs)
2525 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2528 // If this is x86-64, and we disabled SSE, we can't return FP values
2529 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2530 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2531 report_fatal_error("SSE register return with SSE disabled");
2534 // If we prefer to use the value in xmm registers, copy it out as f80 and
2535 // use a truncate to move it from fp stack reg to xmm reg.
2536 bool RoundAfterCopy = false;
2537 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2538 isScalarFPTypeInSSEReg(VA.getValVT())) {
2539 if (!Subtarget.hasX87())
2540 report_fatal_error("X87 register return with X87 disabled");
2542 RoundAfterCopy = (CopyVT != VA.getLocVT());
2546 if (VA.needsCustom()) {
2547 assert(VA.getValVT() == MVT::v64i1 &&
2548 "Currently the only custom case is when we split v64i1 to 2 regs");
2550 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2552 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2554 Val = Chain.getValue(0);
2555 InFlag = Chain.getValue(2);
2559 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2560 // This truncation won't change the value.
2561 DAG.getIntPtrConstant(1, dl));
2563 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2564 if (VA.getValVT().isVector() &&
2565 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2566 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2567 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8.
2568 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2570 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2573 InVals.push_back(Val);
2579 //===----------------------------------------------------------------------===//
2580 // C & StdCall & Fast Calling Convention implementation
2581 //===----------------------------------------------------------------------===//
2582 // The StdCall calling convention is standard for many Windows API
2583 // routines. It differs from the C calling convention only slightly: the
2584 // callee cleans up the stack rather than the caller, and symbols are
2585 // decorated in some fancy way :) It doesn't support any vector arguments.
2586 // For info on fast calling convention see Fast Calling Convention (tail call)
2587 // implementation LowerX86_32FastCCCallTo.
2589 /// CallIsStructReturn - Determines whether a call uses struct return semantics.
2591 enum StructReturnType {
2596 static StructReturnType
2597 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2599 return NotStructReturn;
2601 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2602 if (!Flags.isSRet())
2603 return NotStructReturn;
2604 if (Flags.isInReg() || IsMCU)
2605 return RegStructReturn;
2606 return StackStructReturn;
2609 /// Determines whether a function uses struct return semantics.
2610 static StructReturnType
2611 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2613 return NotStructReturn;
2615 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2616 if (!Flags.isSRet())
2617 return NotStructReturn;
2618 if (Flags.isInReg() || IsMCU)
2619 return RegStructReturn;
2620 return StackStructReturn;
2623 /// Make a copy of an aggregate at address specified by "Src" to address
2624 /// "Dst" with size and alignment information specified by the specific
2625 /// parameter attribute. The copy will be passed as a byval function parameter.
2626 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2627 SDValue Chain, ISD::ArgFlagsTy Flags,
2628 SelectionDAG &DAG, const SDLoc &dl) {
2629 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2631 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2632 /*isVolatile*/false, /*AlwaysInline=*/true,
2633 /*isTailCall*/false,
2634 MachinePointerInfo(), MachinePointerInfo());
2637 /// Return true if the calling convention is one that we can guarantee TCO for.
2638 static bool canGuaranteeTCO(CallingConv::ID CC) {
2639 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2640 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2641 CC == CallingConv::HHVM);
2644 /// Return true if we might ever do TCO for calls with this calling convention.
2645 static bool mayTailCallThisCC(CallingConv::ID CC) {
2647 // C calling conventions:
2648 case CallingConv::C:
2649 case CallingConv::X86_64_Win64:
2650 case CallingConv::X86_64_SysV:
2651 // Callee pop conventions:
2652 case CallingConv::X86_ThisCall:
2653 case CallingConv::X86_StdCall:
2654 case CallingConv::X86_VectorCall:
2655 case CallingConv::X86_FastCall:
2658 return canGuaranteeTCO(CC);
2662 /// Return true if the function is being made into a tailcall target by
2663 /// changing its ABI.
2664 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2665 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2668 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2670 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2671 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2674 ImmutableCallSite CS(CI);
2675 CallingConv::ID CalleeCC = CS.getCallingConv();
2676 if (!mayTailCallThisCC(CalleeCC))
2683 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2684 const SmallVectorImpl<ISD::InputArg> &Ins,
2685 const SDLoc &dl, SelectionDAG &DAG,
2686 const CCValAssign &VA,
2687 MachineFrameInfo &MFI, unsigned i) const {
2688 // Create the nodes corresponding to a load from this parameter slot.
2689 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2690 bool AlwaysUseMutable = shouldGuaranteeTCO(
2691 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2692 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2694 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2696 // If the value is passed by pointer, the address is passed instead of the value
2697 // itself. No need to extend if the mask value and location share the same bit width.
2699 bool ExtendedInMem =
2700 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2701 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2703 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2704 ValVT = VA.getLocVT();
2706 ValVT = VA.getValVT();
2708 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2709 // taken by a return address.
2711 if (CallConv == CallingConv::X86_INTR) {
2712 // X86 interrupts may take one or two arguments.
2713 // On the stack there will be no return address as in a regular call.
2714 // The offset of the last argument needs to be set to -4/-8 bytes.
2715 // The offset of the first argument (when there are two) should be set to 0 bytes.
2716 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2717 if (Subtarget.is64Bit() && Ins.size() == 2) {
2718 // The stack pointer needs to be realigned for 64 bit handlers with error
2719 // code, so the argument offset changes by 8 bytes.
2724 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2725 // changed with more analysis.
2726 // In case of tail call optimization, mark all arguments mutable, since they
2727 // could be overwritten by the lowering of arguments in case of a tail call.
2728 if (Flags.isByVal()) {
2729 unsigned Bytes = Flags.getByValSize();
2730 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2731 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2732 // Adjust SP offset of interrupt parameter.
2733 if (CallConv == CallingConv::X86_INTR) {
2734 MFI.setObjectOffset(FI, Offset);
2736 return DAG.getFrameIndex(FI, PtrVT);
2739 // This is an argument in memory. We might be able to perform copy elision.
2740 if (Flags.isCopyElisionCandidate()) {
2741 EVT ArgVT = Ins[i].ArgVT;
2743 if (Ins[i].PartOffset == 0) {
2744 // If this is a one-part value or the first part of a multi-part value,
2745 // create a stack object for the entire argument value type and return a
2746 // load from our portion of it. This assumes that if the first part of an
2747 // argument is in memory, the rest will also be in memory.
2748 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2749 /*Immutable=*/false);
2750 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2752 ValVT, dl, Chain, PartAddr,
2753 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2755 // This is not the first piece of an argument in memory. See if there is
2756 // already a fixed stack object including this offset. If so, assume it
2757 // was created by the PartOffset == 0 branch above and create a load from
2758 // the appropriate offset into it.
2759 int64_t PartBegin = VA.getLocMemOffset();
2760 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2761 int FI = MFI.getObjectIndexBegin();
2762 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2763 int64_t ObjBegin = MFI.getObjectOffset(FI);
2764 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2765 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2768 if (MFI.isFixedObjectIndex(FI)) {
2770 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2771 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2773 ValVT, dl, Chain, Addr,
2774 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2775 Ins[i].PartOffset));
2780 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2781 VA.getLocMemOffset(), isImmutable);
2783 // Set SExt or ZExt flag.
2784 if (VA.getLocInfo() == CCValAssign::ZExt) {
2785 MFI.setObjectZExt(FI, true);
2786 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2787 MFI.setObjectSExt(FI, true);
2790 // Adjust SP offset of interrupt parameter.
2791 if (CallConv == CallingConv::X86_INTR) {
2792 MFI.setObjectOffset(FI, Offset);
2795 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2796 SDValue Val = DAG.getLoad(
2797 ValVT, dl, Chain, FIN,
2798 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2799 return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)
2803 // FIXME: Get this from tablegen.
2804 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2805 const X86Subtarget &Subtarget) {
2806 assert(Subtarget.is64Bit());
2808 if (Subtarget.isCallingConvWin64(CallConv)) {
2809 static const MCPhysReg GPR64ArgRegsWin64[] = {
2810 X86::RCX, X86::RDX, X86::R8, X86::R9
2812 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2815 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2816 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2818 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2821 // FIXME: Get this from tablegen.
2822 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2823 CallingConv::ID CallConv,
2824 const X86Subtarget &Subtarget) {
2825 assert(Subtarget.is64Bit());
2826 if (Subtarget.isCallingConvWin64(CallConv)) {
2827 // The XMM registers which might contain var arg parameters are shadowed
2828 // in their paired GPRs, so we only need to save the GPRs to their home slots.
2830 // TODO: __vectorcall will change this.
2834 const Function *Fn = MF.getFunction();
2835 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2836 bool isSoftFloat = Subtarget.useSoftFloat();
2837 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2838 "SSE register cannot be used when SSE is disabled!");
2839 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2840 // Kernel mode asks for SSE to be disabled, so there are no XMM argument registers.
2844 static const MCPhysReg XMMArgRegs64Bit[] = {
2845 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2846 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2848 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2852 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2853 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2854 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2855 return A.getValNo() < B.getValNo();
2860 SDValue X86TargetLowering::LowerFormalArguments(
2861 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2862 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2863 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2864 MachineFunction &MF = DAG.getMachineFunction();
2865 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2866 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2868 const Function *Fn = MF.getFunction();
2869 if (Fn->hasExternalLinkage() &&
2870 Subtarget.isTargetCygMing() &&
2871 Fn->getName() == "main")
2872 FuncInfo->setForceFramePointer(true);
2874 MachineFrameInfo &MFI = MF.getFrameInfo();
2875 bool Is64Bit = Subtarget.is64Bit();
2876 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2879 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2880 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2882 if (CallConv == CallingConv::X86_INTR) {
2883 bool isLegal = Ins.size() == 1 ||
2884 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2885 (!Is64Bit && Ins[1].VT == MVT::i32)));
2887 report_fatal_error("X86 interrupts may take one or two arguments");
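// For illustration only (hypothetical IR, not from this file): a handler with
// no error code might be declared as
//   define x86_intrcc void @handler(%frame* byval %f)
// while an x86-64 exception handler that also receives an error code adds an
// i64 as the second argument:
//   define x86_intrcc void @handler(%frame* byval %f, i64 %err)
// Anything else fails the check above.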
2890 // Assign locations to all of the incoming arguments.
2891 SmallVector<CCValAssign, 16> ArgLocs;
2892 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2894 // Allocate shadow area for Win64.
2896 CCInfo.AllocateStack(32, 8);
2898 CCInfo.AnalyzeArguments(Ins, CC_X86);
2900 // In vectorcall calling convention a second pass is required for the HVA
2902 if (CallingConv::X86_VectorCall == CallConv) {
2903 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2906 // The next loop assumes that the locations are in the same order as the Ins array.
2908 assert(isSortedByValueNo(ArgLocs) &&
2909 "Argument Location list must be sorted before lowering");
2912 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2914 assert(InsIndex < Ins.size() && "Invalid Ins index");
2915 CCValAssign &VA = ArgLocs[I];
2917 if (VA.isRegLoc()) {
2918 EVT RegVT = VA.getLocVT();
2919 if (VA.needsCustom()) {
2921 VA.getValVT() == MVT::v64i1 &&
2922 "Currently the only custom case is when we split v64i1 to 2 regs");
2924 // In the regcall calling convention, v64i1 values that are
2925 // compiled for a 32-bit target are split up into two registers.
2927 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2929 const TargetRegisterClass *RC;
2930 if (RegVT == MVT::i32)
2931 RC = &X86::GR32RegClass;
2932 else if (Is64Bit && RegVT == MVT::i64)
2933 RC = &X86::GR64RegClass;
2934 else if (RegVT == MVT::f32)
2935 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2936 else if (RegVT == MVT::f64)
2937 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2938 else if (RegVT == MVT::f80)
2939 RC = &X86::RFP80RegClass;
2940 else if (RegVT == MVT::f128)
2941 RC = &X86::FR128RegClass;
2942 else if (RegVT.is512BitVector())
2943 RC = &X86::VR512RegClass;
2944 else if (RegVT.is256BitVector())
2945 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2946 else if (RegVT.is128BitVector())
2947 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2948 else if (RegVT == MVT::x86mmx)
2949 RC = &X86::VR64RegClass;
2950 else if (RegVT == MVT::i1)
2951 RC = &X86::VK1RegClass;
2952 else if (RegVT == MVT::v8i1)
2953 RC = &X86::VK8RegClass;
2954 else if (RegVT == MVT::v16i1)
2955 RC = &X86::VK16RegClass;
2956 else if (RegVT == MVT::v32i1)
2957 RC = &X86::VK32RegClass;
2958 else if (RegVT == MVT::v64i1)
2959 RC = &X86::VK64RegClass;
2961 llvm_unreachable("Unknown argument type!");
2963 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2964 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2967 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2968 // bits. Insert an assert[sz]ext to capture this, then truncate to the right size.
2970 if (VA.getLocInfo() == CCValAssign::SExt)
2971 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2972 DAG.getValueType(VA.getValVT()));
2973 else if (VA.getLocInfo() == CCValAssign::ZExt)
2974 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2975 DAG.getValueType(VA.getValVT()));
2976 else if (VA.getLocInfo() == CCValAssign::BCvt)
2977 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2979 if (VA.isExtInLoc()) {
2980 // Handle MMX values passed in XMM regs.
2981 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2982 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2983 else if (VA.getValVT().isVector() &&
2984 VA.getValVT().getScalarType() == MVT::i1 &&
2985 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2986 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2987 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
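// Illustrative example: a v16i1 argument whose location is a 16-bit GPR
// arrives here as an i16 value; lowerRegToMasks converts that integer back
// into a v16i1 mask value of the original value type.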
2988 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
2990 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2993 assert(VA.isMemLoc());
2995 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
2998 // If value is passed via pointer - do a load.
2999 if (VA.getLocInfo() == CCValAssign::Indirect)
3001 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3003 InVals.push_back(ArgValue);
3006 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3007 // The Swift calling convention does not require that we copy the sret argument
3008 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3009 if (CallConv == CallingConv::Swift)
3012 // All x86 ABIs require that for returning structs by value we copy the
3013 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3014 // the argument into a virtual register so that we can access it from the return points.
3016 if (Ins[I].Flags.isSRet()) {
3017 unsigned Reg = FuncInfo->getSRetReturnReg();
3019 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3020 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3021 FuncInfo->setSRetReturnReg(Reg);
3023 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3024 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3029 unsigned StackSize = CCInfo.getNextStackOffset();
3030 // Align stack specially for tail calls.
3031 if (shouldGuaranteeTCO(CallConv,
3032 MF.getTarget().Options.GuaranteedTailCallOpt))
3033 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3035 // If the function takes variable number of arguments, make a frame index for
3036 // the start of the first vararg value... for expansion of llvm.va_start. We
3037 // can skip this if there are no va_start calls.
3038 if (MFI.hasVAStart() &&
3039 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3040 CallConv != CallingConv::X86_ThisCall))) {
3041 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3044 // Figure out if XMM registers are in use.
3045 assert(!(Subtarget.useSoftFloat() &&
3046 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3047 "SSE register cannot be used when SSE is disabled!");
3049 // 64-bit calling conventions support varargs and register parameters, so we
3050 // have to do extra work to spill them in the prologue.
3051 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3052 // Find the first unallocated argument registers.
3053 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3054 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3055 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3056 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3057 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3058 "SSE register cannot be used when SSE is disabled!");
3060 // Gather all the live in physical registers.
3061 SmallVector<SDValue, 6> LiveGPRs;
3062 SmallVector<SDValue, 8> LiveXMMRegs;
3064 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3065 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3067 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3069 if (!ArgXMMs.empty()) {
3070 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3071 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3072 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3073 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3074 LiveXMMRegs.push_back(
3075 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3080 // Get to the caller-allocated home save location. Add 8 to account
3081 // for the return address.
3082 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3083 FuncInfo->setRegSaveFrameIndex(
3084 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3085 // Fixup to set vararg frame on shadow area (4 x i64).
3087 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3089 // For X86-64, if there are vararg parameters that are passed via
3090 // registers, then we must store them to their spots on the stack so
3091 // they may be loaded by dereferencing the result of va_next.
3092 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3093 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3094 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3095 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
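// Rough worked example (illustrative): on SysV x86-64 with two fixed integer
// arguments and one fixed XMM argument already allocated, NumIntRegs == 2 and
// NumXMMRegs == 1, so the gp_offset recorded for va_start is 16, the fp_offset
// is 6 * 8 + 1 * 16 == 64, and the register save area itself is a
// 6 * 8 + 8 * 16 == 176 byte stack object.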
3098 // Store the integer parameter registers.
3099 SmallVector<SDValue, 8> MemOps;
3100 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3101 getPointerTy(DAG.getDataLayout()));
3102 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3103 for (SDValue Val : LiveGPRs) {
3104 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3105 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3107 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3108 MachinePointerInfo::getFixedStack(
3109 DAG.getMachineFunction(),
3110 FuncInfo->getRegSaveFrameIndex(), Offset));
3111 MemOps.push_back(Store);
3115 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3116 // Now store the XMM (fp + vector) parameter registers.
3117 SmallVector<SDValue, 12> SaveXMMOps;
3118 SaveXMMOps.push_back(Chain);
3119 SaveXMMOps.push_back(ALVal);
3120 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3121 FuncInfo->getRegSaveFrameIndex(), dl));
3122 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3123 FuncInfo->getVarArgsFPOffset(), dl));
3124 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3126 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3127 MVT::Other, SaveXMMOps));
3130 if (!MemOps.empty())
3131 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3134 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3135 // Find the largest legal vector type.
3136 MVT VecVT = MVT::Other;
3137 // FIXME: Only some x86_32 calling conventions support AVX512.
3138 if (Subtarget.hasAVX512() &&
3139 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3140 CallConv == CallingConv::Intel_OCL_BI)))
3141 VecVT = MVT::v16f32;
3142 else if (Subtarget.hasAVX())
3143 VecVT = MVT::v8f32;
3144 else if (Subtarget.hasSSE2())
3145 VecVT = MVT::v4f32;
3147 // We forward some GPRs and some vector types.
3148 SmallVector<MVT, 2> RegParmTypes;
3149 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3150 RegParmTypes.push_back(IntVT);
3151 if (VecVT != MVT::Other)
3152 RegParmTypes.push_back(VecVT);
3154 // Compute the set of forwarded registers. The rest are scratch.
3155 SmallVectorImpl<ForwardedRegister> &Forwards =
3156 FuncInfo->getForwardedMustTailRegParms();
3157 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3159 // Conservatively forward AL on x86_64, since it might be used for varargs.
3160 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3161 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3162 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3165 // Copy all forwards from physical to virtual registers.
3166 for (ForwardedRegister &F : Forwards) {
3167 // FIXME: Can we use a less constrained schedule?
3168 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3169 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3170 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3174 // Some CCs need callee pop.
3175 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3176 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3177 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3178 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3179 // X86 interrupts must pop the error code (and the alignment padding) if present.
3181 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3183 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3184 // If this is an sret function, the return should pop the hidden pointer.
3185 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3186 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3187 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3188 FuncInfo->setBytesToPopOnReturn(4);
3192 // RegSaveFrameIndex is X86-64 only.
3193 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3194 if (CallConv == CallingConv::X86_FastCall ||
3195 CallConv == CallingConv::X86_ThisCall)
3196 // fastcc functions can't have varargs.
3197 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3200 FuncInfo->setArgumentStackSize(StackSize);
3202 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3203 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3204 if (Personality == EHPersonality::CoreCLR) {
3206 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3207 // that we'd prefer this slot be allocated towards the bottom of the frame
3208 // (i.e. near the stack pointer after allocating the frame). Every
3209 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3210 // offset from the bottom of this and each funclet's frame must be the
3211 // same, so the size of funclets' (mostly empty) frames is dictated by
3212 // how far this slot is from the bottom (since they allocate just enough
3213 // space to accommodate holding this slot at the correct offset).
3214 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3215 EHInfo->PSPSymFrameIdx = PSPSymFI;
3219 if (CallConv == CallingConv::X86_RegCall ||
3220 Fn->hasFnAttribute("no_caller_saved_registers")) {
3221 const MachineRegisterInfo &MRI = MF.getRegInfo();
3222 for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
3223 MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
3229 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3230 SDValue Arg, const SDLoc &dl,
3232 const CCValAssign &VA,
3233 ISD::ArgFlagsTy Flags) const {
3234 unsigned LocMemOffset = VA.getLocMemOffset();
3235 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3236 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3238 if (Flags.isByVal())
3239 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3241 return DAG.getStore(
3242 Chain, dl, Arg, PtrOff,
3243 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3246 /// Emit a load of the return address if tail call
3247 /// optimization is performed and it is required.
3248 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3249 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3250 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3251 // Adjust the Return address stack slot.
3252 EVT VT = getPointerTy(DAG.getDataLayout());
3253 OutRetAddr = getReturnAddressFrameIndex(DAG);
3255 // Load the "old" Return address.
3256 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3257 return SDValue(OutRetAddr.getNode(), 1);
3260 /// Emit a store of the return address if tail call
3261 /// optimization is performed and it is required (FPDiff!=0).
3262 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3263 SDValue Chain, SDValue RetAddrFrIdx,
3264 EVT PtrVT, unsigned SlotSize,
3265 int FPDiff, const SDLoc &dl) {
3266 // Store the return address to the appropriate stack slot.
3267 if (!FPDiff) return Chain;
3268 // Calculate the new stack slot for the return address.
3269 int NewReturnAddrFI =
3270 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3272 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3273 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3274 MachinePointerInfo::getFixedStack(
3275 DAG.getMachineFunction(), NewReturnAddrFI));
3279 /// Returns a vector_shuffle mask for a movs{s|d} or movd
3280 /// operation of the specified width.
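/// For example (illustrative), for a 4-element type this produces the mask
/// <4, 1, 2, 3>: element 0 of the result comes from V2 and the remaining
/// elements are kept from V1, mirroring the MOVSS/MOVSD semantics.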
3281 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3283 unsigned NumElems = VT.getVectorNumElements();
3284 SmallVector<int, 8> Mask;
3285 Mask.push_back(NumElems);
3286 for (unsigned i = 1; i != NumElems; ++i)
3288 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3292 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3293 SmallVectorImpl<SDValue> &InVals) const {
3294 SelectionDAG &DAG = CLI.DAG;
3296 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3297 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3298 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3299 SDValue Chain = CLI.Chain;
3300 SDValue Callee = CLI.Callee;
3301 CallingConv::ID CallConv = CLI.CallConv;
3302 bool &isTailCall = CLI.IsTailCall;
3303 bool isVarArg = CLI.IsVarArg;
3305 MachineFunction &MF = DAG.getMachineFunction();
3306 bool Is64Bit = Subtarget.is64Bit();
3307 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3308 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3309 bool IsSibcall = false;
3310 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3311 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3312 const CallInst *CI =
3313 CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
3314 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3315 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3316 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3318 if (CallConv == CallingConv::X86_INTR)
3319 report_fatal_error("X86 interrupts may not be called directly");
3321 if (Attr.getValueAsString() == "true")
3324 if (Subtarget.isPICStyleGOT() &&
3325 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3326 // If we are using a GOT, disable tail calls to external symbols with
3327 // default visibility. Tail calling such a symbol requires using a GOT
3328 // relocation, which forces early binding of the symbol. This breaks code
3329 // that requires lazy function symbol resolution. Using musttail or
3330 // GuaranteedTailCallOpt will override this.
3331 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3332 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3333 G->getGlobal()->hasDefaultVisibility()))
3337 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3339 // Force this to be a tail call. The verifier rules are enough to ensure
3340 // that we can lower this successfully without moving the return address around.
3343 } else if (isTailCall) {
3344 // Check if it's really possible to do a tail call.
3345 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3346 isVarArg, SR != NotStructReturn,
3347 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3348 Outs, OutVals, Ins, DAG);
3350 // Sibcalls are automatically detected tail calls which do not require ABI changes.
3352 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3359 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3360 "Var args not supported with calling convention fastcc, ghc or hipe");
3362 // Analyze operands of the call, assigning locations to each operand.
3363 SmallVector<CCValAssign, 16> ArgLocs;
3364 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3366 // Allocate shadow area for Win64.
3368 CCInfo.AllocateStack(32, 8);
3370 CCInfo.AnalyzeArguments(Outs, CC_X86);
3372 // In vectorcall calling convention a second pass is required for the HVA
3374 if (CallingConv::X86_VectorCall == CallConv) {
3375 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3378 // Get a count of how many bytes are to be pushed on the stack.
3379 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3381 // This is a sibcall. The memory operands are already available in the
3382 // caller's own incoming argument stack space.
3384 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3385 canGuaranteeTCO(CallConv))
3386 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3389 if (isTailCall && !IsSibcall && !IsMustTail) {
3390 // Lower arguments at fp - stackoffset + fpdiff.
3391 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3393 FPDiff = NumBytesCallerPushed - NumBytes;
3395 // Set the delta of movement of the return address stack slot,
3396 // but only if the delta is greater than the previous delta.
3397 if (FPDiff < X86Info->getTCReturnAddrDelta())
3398 X86Info->setTCReturnAddrDelta(FPDiff);
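// Rough example (illustrative): if the caller pops 16 bytes of its own
// incoming arguments on return but this tail call needs 32 bytes of argument
// space, FPDiff is 16 - 32 = -16 and the return address slot must be moved by
// that delta so the callee's larger argument area fits.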
3401 unsigned NumBytesToPush = NumBytes;
3402 unsigned NumBytesToPop = NumBytes;
3404 // If we have an inalloca argument, all stack space has already been allocated
3405 // for us and will be right at the top of the stack. We don't support multiple
3406 // arguments passed in memory when using inalloca.
3407 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3409 if (!ArgLocs.back().isMemLoc())
3410 report_fatal_error("cannot use inalloca attribute on a register "
3412 if (ArgLocs.back().getLocMemOffset() != 0)
3413 report_fatal_error("any parameter with the inalloca attribute must be "
3414 "the only memory argument");
3418 Chain = DAG.getCALLSEQ_START(
3419 Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3421 SDValue RetAddrFrIdx;
3422 // Load return address for tail calls.
3423 if (isTailCall && FPDiff)
3424 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3425 Is64Bit, FPDiff, dl);
3427 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3428 SmallVector<SDValue, 8> MemOpChains;
3431 // The next loop assumes that the locations are in the same order as the Outs array.
3433 assert(isSortedByValueNo(ArgLocs) &&
3434 "Argument Location list must be sorted before lowering");
3436 // Walk the register/memloc assignments, inserting copies/loads. In the case
3437 // of tail call optimization, arguments are handled later.
3438 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3439 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3441 assert(OutIndex < Outs.size() && "Invalid Out index");
3442 // Skip inalloca arguments, they have already been written.
3443 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3444 if (Flags.isInAlloca())
3447 CCValAssign &VA = ArgLocs[I];
3448 EVT RegVT = VA.getLocVT();
3449 SDValue Arg = OutVals[OutIndex];
3450 bool isByVal = Flags.isByVal();
3452 // Promote the value if needed.
3453 switch (VA.getLocInfo()) {
3454 default: llvm_unreachable("Unknown loc info!");
3455 case CCValAssign::Full: break;
3456 case CCValAssign::SExt:
3457 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3459 case CCValAssign::ZExt:
3460 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3462 case CCValAssign::AExt:
3463 if (Arg.getValueType().isVector() &&
3464 Arg.getValueType().getVectorElementType() == MVT::i1)
3465 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3466 else if (RegVT.is128BitVector()) {
3467 // Special case: passing MMX values in XMM registers.
3468 Arg = DAG.getBitcast(MVT::i64, Arg);
3469 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3470 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3472 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3474 case CCValAssign::BCvt:
3475 Arg = DAG.getBitcast(RegVT, Arg);
3477 case CCValAssign::Indirect: {
3478 // Store the argument.
3479 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3480 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3481 Chain = DAG.getStore(
3482 Chain, dl, Arg, SpillSlot,
3483 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3489 if (VA.needsCustom()) {
3490 assert(VA.getValVT() == MVT::v64i1 &&
3491 "Currently the only custom case is when we split v64i1 to 2 regs");
3492 // Split v64i1 value into two registers
3493 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3495 } else if (VA.isRegLoc()) {
3496 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3497 if (isVarArg && IsWin64) {
3498 // The Win64 ABI requires an argument XMM register to be copied to the
3499 // corresponding shadow register if the callee is a varargs function.
3500 unsigned ShadowReg = 0;
3501 switch (VA.getLocReg()) {
3502 case X86::XMM0: ShadowReg = X86::RCX; break;
3503 case X86::XMM1: ShadowReg = X86::RDX; break;
3504 case X86::XMM2: ShadowReg = X86::R8; break;
3505 case X86::XMM3: ShadowReg = X86::R9; break;
3508 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3510 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3511 assert(VA.isMemLoc());
3512 if (!StackPtr.getNode())
3513 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3514 getPointerTy(DAG.getDataLayout()));
3515 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3516 dl, DAG, VA, Flags));
3520 if (!MemOpChains.empty())
3521 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3523 if (Subtarget.isPICStyleGOT()) {
3524 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3527 RegsToPass.push_back(std::make_pair(
3528 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3529 getPointerTy(DAG.getDataLayout()))));
3531 // If we are tail calling and generating PIC/GOT style code load the
3532 // address of the callee into ECX. The value in ecx is used as target of
3533 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3534 // for tail calls on PIC/GOT architectures. Normally we would just put the
3535 // address of GOT into ebx and then call target@PLT. But for tail calls
3536 // ebx would be restored (since ebx is callee saved) before jumping to the callee.
3539 // Note: The actual moving to ECX is done further down.
3540 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3541 if (G && !G->getGlobal()->hasLocalLinkage() &&
3542 G->getGlobal()->hasDefaultVisibility())
3543 Callee = LowerGlobalAddress(Callee, DAG);
3544 else if (isa<ExternalSymbolSDNode>(Callee))
3545 Callee = LowerExternalSymbol(Callee, DAG);
3549 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3550 // From AMD64 ABI document:
3551 // For calls that may call functions that use varargs or stdargs
3552 // (prototype-less calls or calls to functions containing ellipsis (...) in
3553 // the declaration) %al is used as a hidden argument to specify the number
3554 // of SSE registers used. The contents of %al do not need to match exactly
3555 // the number of registers, but must be an upper bound on the number of SSE
3556 // registers used and must be in the range 0 - 8 inclusive.
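// Illustrative example: a prototype-less call that passes a single double in
// XMM0 allocates one XMM register, so NumXMMRegs below is 1 and the lowered
// call sequence sets %al to 1 before issuing the call instruction.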
3558 // Count the number of XMM registers allocated.
3559 static const MCPhysReg XMMArgRegs[] = {
3560 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3561 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3563 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3564 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3565 && "SSE registers cannot be used when SSE is disabled");
3567 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3568 DAG.getConstant(NumXMMRegs, dl,
3572 if (isVarArg && IsMustTail) {
3573 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3574 for (const auto &F : Forwards) {
3575 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3576 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3580 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3581 // don't need this because the eligibility check rejects calls that require
3582 // shuffling arguments passed in memory.
3583 if (!IsSibcall && isTailCall) {
3584 // Force all the incoming stack arguments to be loaded from the stack
3585 // before any new outgoing arguments are stored to the stack, because the
3586 // outgoing stack slots may alias the incoming argument stack slots, and
3587 // the alias isn't otherwise explicit. This is slightly more conservative
3588 // than necessary, because it means that each store effectively depends
3589 // on every argument instead of just those arguments it would clobber.
3590 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3592 SmallVector<SDValue, 8> MemOpChains2;
3595 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3597 CCValAssign &VA = ArgLocs[I];
3599 if (VA.isRegLoc()) {
3600 if (VA.needsCustom()) {
3601 assert((CallConv == CallingConv::X86_RegCall) &&
3602 "Expecting custom case only in regcall calling convention");
3603 // This means that we are in a special case where one argument was
3604 // passed through two register locations - skip the next location.
3611 assert(VA.isMemLoc());
3612 SDValue Arg = OutVals[OutsIndex];
3613 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3614 // Skip inalloca arguments. They don't require any work.
3615 if (Flags.isInAlloca())
3617 // Create frame index.
3618 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3619 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3620 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3621 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3623 if (Flags.isByVal()) {
3624 // Copy relative to framepointer.
3625 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3626 if (!StackPtr.getNode())
3627 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3628 getPointerTy(DAG.getDataLayout()));
3629 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3632 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3636 // Store relative to framepointer.
3637 MemOpChains2.push_back(DAG.getStore(
3638 ArgChain, dl, Arg, FIN,
3639 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3643 if (!MemOpChains2.empty())
3644 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3646 // Store the return address to the appropriate stack slot.
3647 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3648 getPointerTy(DAG.getDataLayout()),
3649 RegInfo->getSlotSize(), FPDiff, dl);
3652 // Build a sequence of copy-to-reg nodes chained together with token chain
3653 // and flag operands which copy the outgoing args into registers.
3655 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3656 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3657 RegsToPass[i].second, InFlag);
3658 InFlag = Chain.getValue(1);
3661 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3662 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3663 // In the 64-bit large code model, we have to make all calls
3664 // through a register, since the call instruction's 32-bit
3665 // pc-relative offset may not be large enough to hold the whole address.
3667 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3668 // If the callee is a GlobalAddress node (quite common, every direct call
3669 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
3671 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3673 // We should use extra load for direct calls to dllimported functions in
3675 const GlobalValue *GV = G->getGlobal();
3676 if (!GV->hasDLLImportStorageClass()) {
3677 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3679 Callee = DAG.getTargetGlobalAddress(
3680 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3682 if (OpFlags == X86II::MO_GOTPCREL) {
3684 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3685 getPointerTy(DAG.getDataLayout()), Callee);
3686 // Add extra indirection
3687 Callee = DAG.getLoad(
3688 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3689 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3692 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3693 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3694 unsigned char OpFlags =
3695 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3697 Callee = DAG.getTargetExternalSymbol(
3698 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3699 } else if (Subtarget.isTarget64BitILP32() &&
3700 Callee->getValueType(0) == MVT::i32) {
3701 // Zero-extend the 32-bit Callee address into a 64-bit one according to the x32 ABI.
3702 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3705 // Returns a chain & a flag for retval copy to use.
3706 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3707 SmallVector<SDValue, 8> Ops;
3709 if (!IsSibcall && isTailCall) {
3710 Chain = DAG.getCALLSEQ_END(Chain,
3711 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3712 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3713 InFlag = Chain.getValue(1);
3716 Ops.push_back(Chain);
3717 Ops.push_back(Callee);
3720 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3722 // Add argument registers to the end of the list so that they are known live into the call.
3724 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3725 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3726 RegsToPass[i].second.getValueType()));
3728 // Add a register mask operand representing the call-preserved registers.
3729 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3730 // use the X86_INTR calling convention because it has the same CSR mask
3731 // (same preserved registers).
3732 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3733 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3734 assert(Mask && "Missing call preserved mask for calling convention");
3736 // If this is an invoke in a 32-bit function using a funclet-based
3737 // personality, assume the function clobbers all registers. If an exception
3738 // is thrown, the runtime will not restore CSRs.
3739 // FIXME: Model this more precisely so that we can register allocate across
3740 // the normal edge and spill and fill across the exceptional edge.
3741 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3742 const Function *CallerFn = MF.getFunction();
3743 EHPersonality Pers =
3744 CallerFn->hasPersonalityFn()
3745 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3746 : EHPersonality::Unknown;
3747 if (isFuncletEHPersonality(Pers))
3748 Mask = RegInfo->getNoPreservedMask();
3751 // Define a new register mask from the existing mask.
3752 uint32_t *RegMask = nullptr;
3754 // In some calling conventions we need to remove the used physical registers
3755 // from the reg mask.
3756 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3757 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3759 // Allocate a new Reg Mask and copy Mask.
3760 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3761 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3762 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3764 // Make sure all sub registers of the argument registers are reset in the RegMask.
3766 for (auto const &RegPair : RegsToPass)
3767 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3768 SubRegs.isValid(); ++SubRegs)
3769 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3771 // Create the RegMask Operand according to our updated mask.
3772 Ops.push_back(DAG.getRegisterMask(RegMask));
3774 // Create the RegMask Operand according to the static mask.
3775 Ops.push_back(DAG.getRegisterMask(Mask));
3778 if (InFlag.getNode())
3779 Ops.push_back(InFlag);
3783 //// If this is the first return lowered for this function, add the regs
3784 //// to the liveout set for the function.
3785 // This isn't right, although it's probably harmless on x86; liveouts
3786 // should be computed from returns not tail calls. Consider a void
3787 // function making a tail call to a function returning int.
3788 MF.getFrameInfo().setHasTailCall();
3789 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3792 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3793 InFlag = Chain.getValue(1);
3795 // Create the CALLSEQ_END node.
3796 unsigned NumBytesForCalleeToPop;
3797 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3798 DAG.getTarget().Options.GuaranteedTailCallOpt))
3799 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3800 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3801 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3802 SR == StackStructReturn)
3803 // If this is a call to a struct-return function, the callee
3804 // pops the hidden struct pointer, so we have to push it back.
3805 // This is common for Darwin/X86, Linux & Mingw32 targets.
3806 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3807 NumBytesForCalleeToPop = 4;
3809 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3811 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3812 // No need to reset the stack after the call if the call doesn't return. To
3813 // keep the MachineInstr verifier happy, we'll pretend the callee does it for us.
3814 NumBytesForCalleeToPop = NumBytes;
3817 // Returns a flag for retval copy to use.
3819 Chain = DAG.getCALLSEQ_END(Chain,
3820 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3821 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3824 InFlag = Chain.getValue(1);
3827 // Handle result values, copying them out of physregs into vregs that we return.
3829 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3833 //===----------------------------------------------------------------------===//
3834 // Fast Calling Convention (tail call) implementation
3835 //===----------------------------------------------------------------------===//
3837 // Like stdcall, the callee cleans up the arguments, except that ECX is
3838 // reserved for storing the address of the tail-called function. Only 2 registers
3839 // are free for argument passing (inreg). Tail call optimization is performed if:
3841 // * tailcallopt is enabled
3842 // * caller/callee are fastcc
3843 // On X86_64 architecture with GOT-style position independent code only local
3844 // (within module) calls are supported at the moment.
3845 // To keep the stack aligned according to the platform ABI, the function
3846 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3847 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3848 // If a tail-called callee has more arguments than the caller, the
3849 // caller needs to make sure that there is room to move the RETADDR to. This is
3850 // achieved by reserving an area the size of the argument delta right after the
3851 // original RETADDR, but before the saved framepointer or the spilled registers
3852 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3864 /// Make the stack size aligned, e.g. to 16n + 12 for a 16-byte alignment requirement.
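/// A rough worked example (illustrative): with a 16-byte stack alignment and an
/// 8-byte slot size, a StackSize of 20 has (20 & 15) == 4 <= 8, so 4 bytes of
/// padding are added to reach 24; together with the 8-byte return address slot
/// the argument area then ends on a 16-byte boundary.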
3867 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3868 SelectionDAG& DAG) const {
3869 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3870 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3871 unsigned StackAlignment = TFI.getStackAlignment();
3872 uint64_t AlignMask = StackAlignment - 1;
3873 int64_t Offset = StackSize;
3874 unsigned SlotSize = RegInfo->getSlotSize();
3875 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3876 // Number smaller than 12 so just add the difference.
3877 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3879 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3880 Offset = ((~AlignMask) & Offset) + StackAlignment +
3881 (StackAlignment-SlotSize);
3886 /// Return true if the given stack call argument is already available in the
3887 /// same position (relatively) of the caller's incoming argument stack.
3889 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3890 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3891 const X86InstrInfo *TII, const CCValAssign &VA) {
3892 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3895 // Look through nodes that don't alter the bits of the incoming value.
3896 unsigned Op = Arg.getOpcode();
3897 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3898 Arg = Arg.getOperand(0);
3901 if (Op == ISD::TRUNCATE) {
3902 const SDValue &TruncInput = Arg.getOperand(0);
3903 if (TruncInput.getOpcode() == ISD::AssertZext &&
3904 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3905 Arg.getValueType()) {
3906 Arg = TruncInput.getOperand(0);
3914 if (Arg.getOpcode() == ISD::CopyFromReg) {
3915 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3916 if (!TargetRegisterInfo::isVirtualRegister(VR))
3918 MachineInstr *Def = MRI->getVRegDef(VR);
3921 if (!Flags.isByVal()) {
3922 if (!TII->isLoadFromStackSlot(*Def, FI))
3925 unsigned Opcode = Def->getOpcode();
3926 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3927 Opcode == X86::LEA64_32r) &&
3928 Def->getOperand(1).isFI()) {
3929 FI = Def->getOperand(1).getIndex();
3930 Bytes = Flags.getByValSize();
3934 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3935 if (Flags.isByVal())
3936 // ByVal argument is passed in as a pointer but it's now being
3937 // dereferenced. e.g.
3938 // define @foo(%struct.X* %A) {
3939 // tail call @bar(%struct.X* byval %A)
3942 SDValue Ptr = Ld->getBasePtr();
3943 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3946 FI = FINode->getIndex();
3947 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3948 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3949 FI = FINode->getIndex();
3950 Bytes = Flags.getByValSize();
3954 assert(FI != INT_MAX);
3955 if (!MFI.isFixedObjectIndex(FI))
3958 if (Offset != MFI.getObjectOffset(FI))
3961 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3962 // If the argument location is wider than the argument type, check that any
3963 // extension flags match.
3964 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
3965 Flags.isSExt() != MFI.isObjectSExt(FI)) {
3970 return Bytes == MFI.getObjectSize(FI);
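// Illustrative example: an outgoing i32 argument that is just a reload of the
// caller's own incoming i32 argument from fixed stack offset 8, forwarded to
// the same offset with matching size and extension flags, satisfies this check
// and needs no copy for the tail call.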
3973 /// Check whether the call is eligible for tail call optimization. Targets
3974 /// that want to do tail call optimization should implement this function.
3975 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3976 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3977 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3978 const SmallVectorImpl<ISD::OutputArg> &Outs,
3979 const SmallVectorImpl<SDValue> &OutVals,
3980 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3981 if (!mayTailCallThisCC(CalleeCC))
3984 // If -tailcallopt is specified, make fastcc functions tail-callable.
3985 MachineFunction &MF = DAG.getMachineFunction();
3986 const Function *CallerF = MF.getFunction();
3988 // If the function return type is x86_fp80 and the callee return type is not,
3989 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3990 // perform a tailcall optimization here.
3991 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3994 CallingConv::ID CallerCC = CallerF->getCallingConv();
3995 bool CCMatch = CallerCC == CalleeCC;
3996 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
3997 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
3999 // Win64 functions have extra shadow space for argument homing. Don't do the
4000 // sibcall if the caller and callee have mismatched expectations for this space.
4002 if (IsCalleeWin64 != IsCallerWin64)
4005 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4006 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4011 // Look for obvious safe cases to perform tail call optimization that do not
4012 // require ABI changes. This is what gcc calls sibcall.
4014 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4015 // emit a special epilogue.
4016 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4017 if (RegInfo->needsStackRealignment(MF))
4020 // Also avoid sibcall optimization if either caller or callee uses struct
4021 // return semantics.
4022 if (isCalleeStructRet || isCallerStructRet)
4025 // Do not sibcall optimize vararg calls unless all arguments are passed via registers.
4027 LLVMContext &C = *DAG.getContext();
4028 if (isVarArg && !Outs.empty()) {
4029 // Optimizing for varargs on Win64 is unlikely to be safe without
4030 // additional testing.
4031 if (IsCalleeWin64 || IsCallerWin64)
4034 SmallVector<CCValAssign, 16> ArgLocs;
4035 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4037 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4038 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4039 if (!ArgLocs[i].isRegLoc())
4043 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4044 // stack. Therefore, if it's not used by the call it is not safe to optimize
4045 // this into a sibcall.
4046 bool Unused = false;
4047 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4054 SmallVector<CCValAssign, 16> RVLocs;
4055 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4056 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4057 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4058 CCValAssign &VA = RVLocs[i];
4059 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4064 // Check that the call results are passed in the same way.
4065 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4066 RetCC_X86, RetCC_X86))
4068 // The callee has to preserve all registers the caller needs to preserve.
4069 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4070 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4072 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4073 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4077 unsigned StackArgsSize = 0;
4079 // If the callee takes no arguments then go on to check the results of the call.
4081 if (!Outs.empty()) {
4082 // Check if stack adjustment is needed. For now, do not do this if any
4083 // argument is passed on the stack.
4084 SmallVector<CCValAssign, 16> ArgLocs;
4085 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4087 // Allocate shadow area for Win64
4089 CCInfo.AllocateStack(32, 8);
4091 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4092 StackArgsSize = CCInfo.getNextStackOffset();
4094 if (CCInfo.getNextStackOffset()) {
4095 // Check if the arguments are already laid out in the right way as
4096 // the caller's fixed stack objects.
4097 MachineFrameInfo &MFI = MF.getFrameInfo();
4098 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4099 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4100 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4101 CCValAssign &VA = ArgLocs[i];
4102 SDValue Arg = OutVals[i];
4103 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4104 if (VA.getLocInfo() == CCValAssign::Indirect)
4106 if (!VA.isRegLoc()) {
4107 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4114 bool PositionIndependent = isPositionIndependent();
4115 // If the tailcall address may be in a register, then make sure it's
4116 // possible to register allocate for it. In 32-bit, the call address can
4117 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4118 // callee-saved registers are restored. These happen to be the same
4119 // registers used to pass 'inreg' arguments so watch out for those.
4120 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4121 !isa<ExternalSymbolSDNode>(Callee)) ||
4122 PositionIndependent)) {
4123 unsigned NumInRegs = 0;
4124 // In PIC we need an extra register to formulate the address computation for the callee.
4126 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4128 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4129 CCValAssign &VA = ArgLocs[i];
4132 unsigned Reg = VA.getLocReg();
4135 case X86::EAX: case X86::EDX: case X86::ECX:
4136 if (++NumInRegs == MaxInRegs)
4143 const MachineRegisterInfo &MRI = MF.getRegInfo();
4144 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4148 bool CalleeWillPop =
4149 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4150 MF.getTarget().Options.GuaranteedTailCallOpt);
4152 if (unsigned BytesToPop =
4153 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4154 // If we have bytes to pop, the callee must pop them.
4155 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4156 if (!CalleePopMatches)
4158 } else if (CalleeWillPop && StackArgsSize > 0) {
4159 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4167 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4168 const TargetLibraryInfo *libInfo) const {
4169 return X86::createFastISel(funcInfo, libInfo);
4172 //===----------------------------------------------------------------------===//
4173 // Other Lowering Hooks
4174 //===----------------------------------------------------------------------===//
4176 static bool MayFoldLoad(SDValue Op) {
4177 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4180 static bool MayFoldIntoStore(SDValue Op) {
4181 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4184 static bool MayFoldIntoZeroExtend(SDValue Op) {
4185 if (Op.hasOneUse()) {
4186 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4187 return (ISD::ZERO_EXTEND == Opcode);
4192 static bool isTargetShuffle(unsigned Opcode) {
4194 default: return false;
4195 case X86ISD::BLENDI:
4196 case X86ISD::PSHUFB:
4197 case X86ISD::PSHUFD:
4198 case X86ISD::PSHUFHW:
4199 case X86ISD::PSHUFLW:
4201 case X86ISD::INSERTPS:
4202 case X86ISD::PALIGNR:
4203 case X86ISD::VSHLDQ:
4204 case X86ISD::VSRLDQ:
4205 case X86ISD::MOVLHPS:
4206 case X86ISD::MOVLHPD:
4207 case X86ISD::MOVHLPS:
4208 case X86ISD::MOVLPS:
4209 case X86ISD::MOVLPD:
4210 case X86ISD::MOVSHDUP:
4211 case X86ISD::MOVSLDUP:
4212 case X86ISD::MOVDDUP:
4215 case X86ISD::UNPCKL:
4216 case X86ISD::UNPCKH:
4217 case X86ISD::VBROADCAST:
4218 case X86ISD::VPERMILPI:
4219 case X86ISD::VPERMILPV:
4220 case X86ISD::VPERM2X128:
4221 case X86ISD::VPERMIL2:
4222 case X86ISD::VPERMI:
4223 case X86ISD::VPPERM:
4224 case X86ISD::VPERMV:
4225 case X86ISD::VPERMV3:
4226 case X86ISD::VPERMIV3:
4227 case X86ISD::VZEXT_MOVL:
4232 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4234 default: return false;
4236 case X86ISD::PSHUFB:
4237 case X86ISD::VPERMILPV:
4238 case X86ISD::VPERMIL2:
4239 case X86ISD::VPPERM:
4240 case X86ISD::VPERMV:
4241 case X86ISD::VPERMV3:
4242 case X86ISD::VPERMIV3:
4244 // 'Faux' Target Shuffles.
4251 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4252 MachineFunction &MF = DAG.getMachineFunction();
4253 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4254 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4255 int ReturnAddrIndex = FuncInfo->getRAIndex();
4257 if (ReturnAddrIndex == 0) {
4258 // Set up a frame object for the return address.
4259 unsigned SlotSize = RegInfo->getSlotSize();
4260 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4263 FuncInfo->setRAIndex(ReturnAddrIndex);
4266 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4269 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4270 bool hasSymbolicDisplacement) {
4271 // The offset should fit into a 32-bit immediate field.
4272 if (!isInt<32>(Offset))
4275 // If we don't have a symbolic displacement - we don't have any extra restrictions.
4277 if (!hasSymbolicDisplacement)
4280 // FIXME: Some tweaks might be needed for medium code model.
4281 if (M != CodeModel::Small && M != CodeModel::Kernel)
4285 // For the small code model we assume that the latest object is 16MB before the
4286 // end of the 31-bit boundary. We may also accept pretty large negative constants,
4287 // knowing that all objects are in the positive half of the address space.
4287 if (M == CodeModel::Small && Offset < 16*1024*1024)
4290 // For the kernel code model we know that all objects reside in the negative half
4291 // of the 32-bit address space. We may not accept negative offsets, since they may
4292 // be just off, and we may accept pretty large positive ones.
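// Illustrative examples: with a symbolic displacement, an offset of 8MB is
// acceptable under the small code model (it stays below the 16MB guard before
// the 31-bit boundary), while the kernel code model accepts only non-negative
// offsets because all objects reside in the negative half of the 32-bit
// address space.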
4293 if (M == CodeModel::Kernel && Offset >= 0)
4299 /// Determines whether the callee is required to pop its own arguments.
4300 /// Callee pop is necessary to support tail calls.
4301 bool X86::isCalleePop(CallingConv::ID CallingConv,
4302 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4303 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4304 // can guarantee TCO.
4305 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4308 switch (CallingConv) {
4311 case CallingConv::X86_StdCall:
4312 case CallingConv::X86_FastCall:
4313 case CallingConv::X86_ThisCall:
4314 case CallingConv::X86_VectorCall:
4319 /// \brief Return true if the condition is an unsigned comparison operation.
4320 static bool isX86CCUnsigned(unsigned X86CC) {
4323 llvm_unreachable("Invalid integer condition!");
4339 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4340 switch (SetCCOpcode) {
4341 default: llvm_unreachable("Invalid integer condition!");
4342 case ISD::SETEQ: return X86::COND_E;
4343 case ISD::SETGT: return X86::COND_G;
4344 case ISD::SETGE: return X86::COND_GE;
4345 case ISD::SETLT: return X86::COND_L;
4346 case ISD::SETLE: return X86::COND_LE;
4347 case ISD::SETNE: return X86::COND_NE;
4348 case ISD::SETULT: return X86::COND_B;
4349 case ISD::SETUGT: return X86::COND_A;
4350 case ISD::SETULE: return X86::COND_BE;
4351 case ISD::SETUGE: return X86::COND_AE;
4355 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4356 /// condition code, returning the condition code and the LHS/RHS of the
4357 /// comparison to make.
4358 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4359 bool isFP, SDValue &LHS, SDValue &RHS,
4360 SelectionDAG &DAG) {
4362 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4363 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4364 // X > -1 -> X == 0, jump !sign.
4365 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4366 return X86::COND_NS;
4368 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4369 // X < 0 -> X == 0, jump on sign.
4372 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4374 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4375 return X86::COND_LE;
4379 return TranslateIntegerX86CC(SetCCOpcode);
4382 // First determine if it is required or is profitable to flip the operands.
4384 // If LHS is a foldable load, but RHS is not, flip the condition.
4385 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4386 !ISD::isNON_EXTLoad(RHS.getNode())) {
4387 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4388 std::swap(LHS, RHS);
4391 switch (SetCCOpcode) {
4397 std::swap(LHS, RHS);
4401   // On a floating point condition, the flags are set as follows:
       //  ZF | PF | CF |   op
4403   //   0 |  0 |  0 | X > Y
4404   //   0 |  0 |  1 | X < Y
4405   //   1 |  0 |  0 | X == Y
4406   //   1 |  1 |  1 | unordered
4407 switch (SetCCOpcode) {
4408 default: llvm_unreachable("Condcode should be pre-legalized away");
4410 case ISD::SETEQ: return X86::COND_E;
4411 case ISD::SETOLT: // flipped
4413 case ISD::SETGT: return X86::COND_A;
4414 case ISD::SETOLE: // flipped
4416 case ISD::SETGE: return X86::COND_AE;
4417 case ISD::SETUGT: // flipped
4419 case ISD::SETLT: return X86::COND_B;
4420 case ISD::SETUGE: // flipped
4422 case ISD::SETLE: return X86::COND_BE;
4424 case ISD::SETNE: return X86::COND_NE;
4425 case ISD::SETUO: return X86::COND_P;
4426 case ISD::SETO: return X86::COND_NP;
4428 case ISD::SETUNE: return X86::COND_INVALID;
4432 /// Is there a floating point cmov for the specific X86 condition code?
4433 /// Current x86 isa includes the following FP cmov instructions:
4434 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4435 static bool hasFPCMov(unsigned X86CC) {
4452 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4454 unsigned Intrinsic) const {
4456 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4460 Info.opc = ISD::INTRINSIC_W_CHAIN;
4461 Info.readMem = false;
4462 Info.writeMem = false;
4466 switch (IntrData->Type) {
4467 case EXPAND_FROM_MEM: {
4468 Info.ptrVal = I.getArgOperand(0);
4469 Info.memVT = MVT::getVT(I.getType());
4471 Info.readMem = true;
4474 case COMPRESS_TO_MEM: {
4475 Info.ptrVal = I.getArgOperand(0);
4476 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4478 Info.writeMem = true;
4481 case TRUNCATE_TO_MEM_VI8:
4482 case TRUNCATE_TO_MEM_VI16:
4483 case TRUNCATE_TO_MEM_VI32: {
4484 Info.ptrVal = I.getArgOperand(0);
4485 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4486 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4487 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4489 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4490 ScalarVT = MVT::i16;
4491 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4492 ScalarVT = MVT::i32;
4494 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4496 Info.writeMem = true;
4506 /// Returns true if the target can instruction select the
4507 /// specified FP immediate natively. If false, the legalizer will
4508 /// materialize the FP immediate as a load from a constant pool.
4509 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4510 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4511 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4517 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4518 ISD::LoadExtType ExtTy,
4520 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4521   // relocation targets a movq or addq instruction: don't let the load shrink.
4522 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4523 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4524 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4525 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4529 /// \brief Returns true if it is beneficial to convert a load of a constant
4530 /// to just the constant itself.
4531 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4533 assert(Ty->isIntegerTy());
4535 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4536 if (BitSize == 0 || BitSize > 64)
4541 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4542 unsigned Index) const {
4543 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4546 return (Index == 0 || Index == ResVT.getVectorNumElements());
4549 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4550 // Speculate cttz only if we can directly use TZCNT.
4551 return Subtarget.hasBMI();
4554 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4555 // Speculate ctlz only if we can directly use LZCNT.
4556 return Subtarget.hasLZCNT();
4559 bool X86TargetLowering::isCtlzFast() const {
4560 return Subtarget.hasFastLZCNT();
4563 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4564 const Instruction &AndI) const {
4568 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4569 if (!Subtarget.hasBMI())
4572 // There are only 32-bit and 64-bit forms for 'andn'.
4573 EVT VT = Y.getValueType();
4574 if (VT != MVT::i32 && VT != MVT::i64)
4580 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4581 MVT VT = MVT::getIntegerVT(NumBits);
4582 if (isTypeLegal(VT))
4585 // PMOVMSKB can handle this.
4586 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4589 // VPMOVMSKB can handle this.
4590 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4593 // TODO: Allow 64-bit type for 32-bit target.
4594 // TODO: 512-bit types should be allowed, but make sure that those
4595 // cases are handled in combineVectorSizedSetCCEquality().
4597 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4600 /// Val is the undef sentinel value or equal to the specified value.
4601 static bool isUndefOrEqual(int Val, int CmpVal) {
4602 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4605 /// Val is either the undef or zero sentinel value.
4606 static bool isUndefOrZero(int Val) {
4607 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4610 /// Return true if every element in Mask, beginning
4611 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4612 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4613 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4614 if (Mask[i] != SM_SentinelUndef)
4619 /// Return true if Val is undef or if its value falls within the
4620 /// specified range [Low, Hi).
4621 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4622 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4625 /// Return true if every element in Mask is undef or if its value
4626 /// falls within the specified range [Low, Hi).
4627 static bool isUndefOrInRange(ArrayRef<int> Mask,
4630 if (!isUndefOrInRange(M, Low, Hi))
4635 /// Return true if Val is undef, zero or if its value falls within the
4636 /// specified range [Low, Hi).
4637 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4638 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4641 /// Return true if every element in Mask is undef, zero or if its value
4642 /// falls within the specified range [Low, Hi).
4643 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4645 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4650 /// Return true if every element in Mask, beginning
4651 /// from position Pos and ending in Pos+Size, falls within the specified
4652 /// sequential range [Low, Low+Size), or is undef.
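///
/// For example, Mask = <4, 5, -1, 7> with Pos = 0, Size = 4, Low = 4 returns
/// true: each defined element matches the expected sequential value.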
4653 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4654 unsigned Pos, unsigned Size, int Low) {
4655 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4656 if (!isUndefOrEqual(Mask[i], Low))
4661 /// Return true if every element in Mask, beginning
4662 /// from position Pos and ending in Pos+Size, falls within the specified
4663 /// sequential range [Low, Low+Size), or is undef or zero.
4664 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4665 unsigned Size, int Low) {
4666 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4667 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4672 /// Return true if every element in Mask, beginning
4673 /// from position Pos and ending in Pos+Size is undef or is zero.
4674 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4676 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4677 if (!isUndefOrZero(Mask[i]))
4682 /// \brief Helper function to test whether a shuffle mask could be
4683 /// simplified by widening the elements being shuffled.
4685 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4686 /// leaves it in an unspecified state.
4688 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4689 /// shuffle masks. The latter have the special property of a '-2' representing
4690 /// a zero-ed lane of a vector.
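///
/// For example, the v4 mask <0, 1, 6, 7> widens to <0, 3>, and <-1, 1, 4, 5>
/// widens to <0, 2>, while <0, 2, 4, 6> cannot be widened because the mask
/// values are not adjacent pairs.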
4691 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4692 SmallVectorImpl<int> &WidenedMask) {
4693 WidenedMask.assign(Mask.size() / 2, 0);
4694 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4696 int M1 = Mask[i + 1];
4698     // If both elements are undef, it's trivial.
4699 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4700 WidenedMask[i / 2] = SM_SentinelUndef;
4704 // Check for an undef mask and a mask value properly aligned to fit with
4705 // a pair of values. If we find such a case, use the non-undef mask's value.
4706 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4707 WidenedMask[i / 2] = M1 / 2;
4710 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4711 WidenedMask[i / 2] = M0 / 2;
4715 // When zeroing, we need to spread the zeroing across both lanes to widen.
4716 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4717 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4718 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4719 WidenedMask[i / 2] = SM_SentinelZero;
4725     // Finally check if the two mask values are adjacent and aligned with a pair.
4727 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4728 WidenedMask[i / 2] = M0 / 2;
4732 // Otherwise we can't safely widen the elements used in this shuffle.
4735 assert(WidenedMask.size() == Mask.size() / 2 &&
4736 "Incorrect size of mask after widening the elements!");
4741 /// Helper function to scale a shuffle or target shuffle mask, replacing each
4742 /// mask index with the scaled sequential indices for an equivalent narrowed
4743 /// mask. This is the reverse process to canWidenShuffleElements, but can always succeed.
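///
/// For example, scaling the v2 mask <0, 2> by 2 produces the v4 mask
/// <0, 1, 4, 5>; sentinel values (undef/zero) are simply repeated.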
4745 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
4746 SmallVectorImpl<int> &ScaledMask) {
4747 assert(0 < Scale && "Unexpected scaling factor");
4748 int NumElts = Mask.size();
4749 ScaledMask.assign(NumElts * Scale, -1);
4751 for (int i = 0; i != NumElts; ++i) {
4754 // Repeat sentinel values in every mask element.
4756 for (int s = 0; s != Scale; ++s)
4757 ScaledMask[(Scale * i) + s] = M;
4761 // Scale mask element and increment across each mask element.
4762 for (int s = 0; s != Scale; ++s)
4763 ScaledMask[(Scale * i) + s] = (Scale * M) + s;
4767 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4768 /// extract that is suitable for instructions that extract 128- or 256-bit vectors.
4769 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4770 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4771 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4774 // The index should be aligned on a vecWidth-bit boundary.
4775 uint64_t Index = N->getConstantOperandVal(1);
4776 MVT VT = N->getSimpleValueType(0);
4777 unsigned ElSize = VT.getScalarSizeInBits();
4778 return (Index * ElSize) % vecWidth == 0;
4781 /// Return true if the specified INSERT_SUBVECTOR
4782 /// operand specifies a subvector insert that is suitable for insertion
4783 /// of 128- or 256-bit subvectors.
4784 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4785 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4786 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4789 // The index should be aligned on a vecWidth-bit boundary.
4790 uint64_t Index = N->getConstantOperandVal(2);
4791 MVT VT = N->getSimpleValueType(0);
4792 unsigned ElSize = VT.getScalarSizeInBits();
4793 return (Index * ElSize) % vecWidth == 0;
4796 bool X86::isVINSERT128Index(SDNode *N) {
4797 return isVINSERTIndex(N, 128);
4800 bool X86::isVINSERT256Index(SDNode *N) {
4801 return isVINSERTIndex(N, 256);
4804 bool X86::isVEXTRACT128Index(SDNode *N) {
4805 return isVEXTRACTIndex(N, 128);
4808 bool X86::isVEXTRACT256Index(SDNode *N) {
4809 return isVEXTRACTIndex(N, 256);
4812 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4813 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4814 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4815 "Illegal extract subvector for VEXTRACT");
4817 uint64_t Index = N->getConstantOperandVal(1);
4818 MVT VecVT = N->getOperand(0).getSimpleValueType();
4819 unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
4820 return Index / NumElemsPerChunk;
4823 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4824 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4825 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4826 "Illegal insert subvector for VINSERT");
4828 uint64_t Index = N->getConstantOperandVal(2);
4829 MVT VecVT = N->getSimpleValueType(0);
4830 unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
4831 return Index / NumElemsPerChunk;
4834 /// Return the appropriate immediate to extract the specified
4835 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
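/// For example, extracting elements 4-7 of a v8f32 source (extract index 4)
/// yields immediate 1, selecting the upper 128-bit lane.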
4836 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4837 return getExtractVEXTRACTImmediate(N, 128);
4840 /// Return the appropriate immediate to extract the specified
4841 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4842 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4843 return getExtractVEXTRACTImmediate(N, 256);
4846 /// Return the appropriate immediate to insert at the specified
4847 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4848 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4849 return getInsertVINSERTImmediate(N, 128);
4852 /// Return the appropriate immediate to insert at the specified
4853 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4854 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4855 return getInsertVINSERTImmediate(N, 256);
4858 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4859 bool X86::isZeroNode(SDValue Elt) {
4860 return isNullConstant(Elt) || isNullFPConstant(Elt);
4863 // Build a vector of constants.
4864 // Use an UNDEF node if MaskElt == -1.
4865 // Split 64-bit constants in 32-bit mode.
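// Illustrative example: on a 32-bit target (where i64 is illegal) a v2i64
// constant <1, 2> is built as the v4i32 vector <1, 0, 2, 0> and then bitcast
// back to v2i64.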
4866 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4867 const SDLoc &dl, bool IsMask = false) {
4869 SmallVector<SDValue, 32> Ops;
4872 MVT ConstVecVT = VT;
4873 unsigned NumElts = VT.getVectorNumElements();
4874 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4875 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4876 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4880 MVT EltVT = ConstVecVT.getVectorElementType();
4881 for (unsigned i = 0; i < NumElts; ++i) {
4882 bool IsUndef = Values[i] < 0 && IsMask;
4883 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4884 DAG.getConstant(Values[i], dl, EltVT);
4885 Ops.push_back(OpNode);
4887 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4888 DAG.getConstant(0, dl, EltVT));
4890 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4892 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4896 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4897 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4898 assert(Bits.size() == Undefs.getBitWidth() &&
4899 "Unequal constant and undef arrays");
4900 SmallVector<SDValue, 32> Ops;
4903 MVT ConstVecVT = VT;
4904 unsigned NumElts = VT.getVectorNumElements();
4905 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4906 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4907 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4911 MVT EltVT = ConstVecVT.getVectorElementType();
4912 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4914 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4917 const APInt &V = Bits[i];
4918 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4920 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4921 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4922 } else if (EltVT == MVT::f32) {
4923 APFloat FV(APFloat::IEEEsingle(), V);
4924 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4925 } else if (EltVT == MVT::f64) {
4926 APFloat FV(APFloat::IEEEdouble(), V);
4927 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4929 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4933 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4934 return DAG.getBitcast(VT, ConstsNode);
4937 /// Returns a vector of specified type with all zero elements.
4938 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4939 SelectionDAG &DAG, const SDLoc &dl) {
4940 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4941 VT.getVectorElementType() == MVT::i1) &&
4942 "Unexpected vector type");
4944 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4945 // type. This ensures they get CSE'd. But if the integer type is not
4946 // available, use a floating-point +0.0 instead.
4948 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4949 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4950 } else if (VT.getVectorElementType() == MVT::i1) {
4951 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4952 "Unexpected vector type");
4953 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4954 "Unexpected vector type");
4955 Vec = DAG.getConstant(0, dl, VT);
4957 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4958 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4960 return DAG.getBitcast(VT, Vec);
4963 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4964 const SDLoc &dl, unsigned vectorWidth) {
4965 EVT VT = Vec.getValueType();
4966 EVT ElVT = VT.getVectorElementType();
4967 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4968 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4969 VT.getVectorNumElements()/Factor);
4971 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4972 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4973 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4975 // This is the index of the first element of the vectorWidth-bit chunk
4976 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
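  // For example, with a v8i32 source and vectorWidth == 128, ElemsPerChunk is
  // 4, so an IdxVal of 5 is rounded down to 4 (the start of the upper lane).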
4977 IdxVal &= ~(ElemsPerChunk - 1);
4979 // If the input is a buildvector just emit a smaller one.
4980 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4981 return DAG.getBuildVector(
4982 ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
4984 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4985 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4988 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4989 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4990 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4991 /// instructions or a simple subregister reference. Idx is an index in the
4992 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4993 /// lowering EXTRACT_VECTOR_ELT operations easier.
4994 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4995 SelectionDAG &DAG, const SDLoc &dl) {
4996 assert((Vec.getValueType().is256BitVector() ||
4997 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4998 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5001 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5002 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5003 SelectionDAG &DAG, const SDLoc &dl) {
5004 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5005 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5008 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5009 SelectionDAG &DAG, const SDLoc &dl,
5010 unsigned vectorWidth) {
5011 assert((vectorWidth == 128 || vectorWidth == 256) &&
5012 "Unsupported vector width");
5013   // Inserting an UNDEF subvector simply yields Result.
5016 EVT VT = Vec.getValueType();
5017 EVT ElVT = VT.getVectorElementType();
5018 EVT ResultVT = Result.getValueType();
5020 // Insert the relevant vectorWidth bits.
5021 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5022 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5024 // This is the index of the first element of the vectorWidth-bit chunk
5025 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5026 IdxVal &= ~(ElemsPerChunk - 1);
5028 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5029 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5032 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5033 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5034 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5035 /// simple superregister reference. Idx is an index in the 128 bits
5036 /// we want. It need not be aligned to a 128-bit boundary. That makes
5037 /// lowering INSERT_VECTOR_ELT operations easier.
5038 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5039 SelectionDAG &DAG, const SDLoc &dl) {
5040 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5041 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5044 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5045 SelectionDAG &DAG, const SDLoc &dl) {
5046 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5047 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
5050 /// Insert i1-subvector to i1-vector.
5051 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5052 const X86Subtarget &Subtarget) {
5055 SDValue Vec = Op.getOperand(0);
5056 SDValue SubVec = Op.getOperand(1);
5057 SDValue Idx = Op.getOperand(2);
5059 if (!isa<ConstantSDNode>(Idx))
5062 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5063 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5066 MVT OpVT = Op.getSimpleValueType();
5067 MVT SubVecVT = SubVec.getSimpleValueType();
5068 unsigned NumElems = OpVT.getVectorNumElements();
5069 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5071 assert(IdxVal + SubVecNumElems <= NumElems &&
5072 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5073 "Unexpected index value in INSERT_SUBVECTOR");
5075 // There are 3 possible cases:
5076 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5077 // 2. Subvector should be inserted in the upper part
5078 // (IdxVal + SubVecNumElems == NumElems)
5079 // 3. Subvector should be inserted in the middle (for example v2i1
5080 // to v16i1, index 2)
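  //
  // The undef/zero-vector and lower/upper cases below are handled with
  // KSHIFTL/KSHIFTR on the widened mask type (plus an OR where the original
  // vector must be preserved); only the middle-insertion case falls back to a
  // vector shuffle.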
5082 // extend to natively supported kshift
5083 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5084 MVT WideOpVT = OpVT;
5085 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
5088 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5089 SDValue Undef = DAG.getUNDEF(WideOpVT);
5090 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5091 Undef, SubVec, ZeroIdx);
5093   // Extract sub-vector if required.
5094 auto ExtractSubVec = [&](SDValue V) {
5095 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
5099 if (Vec.isUndef()) {
5101 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5102 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5105 return ExtractSubVec(WideSubVec);
5108 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5109 NumElems = WideOpVT.getVectorNumElements();
5110 unsigned ShiftLeft = NumElems - SubVecNumElems;
5111 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5112 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5113 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5114 Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5115 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5116 return ExtractSubVec(Vec);
5120 // Zero lower bits of the Vec
5121 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5122 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5123 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5124 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5125 // Merge them together, SubVec should be zero extended.
5126 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5127 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5129 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5130 return ExtractSubVec(Vec);
5133 // Simple case when we put subvector in the upper part
5134 if (IdxVal + SubVecNumElems == NumElems) {
5135 // Zero upper bits of the Vec
5136 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5137 DAG.getConstant(IdxVal, dl, MVT::i8));
5138 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5139 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5140 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5141 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5142 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5143 return ExtractSubVec(Vec);
5145 // Subvector should be inserted in the middle - use shuffle
5146 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
5148 SmallVector<int, 64> Mask;
5149 for (unsigned i = 0; i < NumElems; ++i)
5150 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5152 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5155 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
5156 /// instructions. This is used because creating CONCAT_VECTOR nodes of
5157 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5158 /// large BUILD_VECTORS.
5159 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5160 unsigned NumElems, SelectionDAG &DAG,
5162 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5163 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5166 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5167 unsigned NumElems, SelectionDAG &DAG,
5169 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5170 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5173 /// Returns a vector of specified type with all bits set.
5174 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5175 /// Then bitcast to their original type, ensuring they get CSE'd.
5176 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5177 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5178 "Expected a 128/256/512-bit vector type");
5180 APInt Ones = APInt::getAllOnesValue(32);
5181 unsigned NumElts = VT.getSizeInBits() / 32;
5182 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5183 return DAG.getBitcast(VT, Vec);
5186 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5187 SelectionDAG &DAG) {
5188 EVT InVT = In.getValueType();
5189 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5191 if (VT.is128BitVector() && InVT.is128BitVector())
5192 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5193 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5195 // For 256-bit vectors, we only need the lower (128-bit) input half.
5196 // For 512-bit vectors, we only need the lower input half or quarter.
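  // For example, extending v16i32 to v8i64 (Scale == 2) only needs the low
  // 256 bits of the input, so the source is shrunk to v8i32 first.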
5197 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5198 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5199 In = extractSubVector(In, 0, DAG, DL,
5200 std::max(128, (int)VT.getSizeInBits() / Scale));
5203 return DAG.getNode(Opc, DL, VT, In);
5206 /// Generate unpacklo/unpackhi shuffle mask.
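/// For example, for v4i32 this produces the lo mask <0, 4, 1, 5> and the hi
/// mask <2, 6, 3, 7> (with Unary == false).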
5207 static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
5209 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5210 int NumElts = VT.getVectorNumElements();
5211 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5213 for (int i = 0; i < NumElts; ++i) {
5214 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5215 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5216 Pos += (Unary ? 0 : NumElts * (i % 2));
5217 Pos += (Lo ? 0 : NumEltsInLane / 2);
5218 Mask.push_back(Pos);
5222 /// Returns a vector_shuffle node for an unpackl operation.
5223 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5224 SDValue V1, SDValue V2) {
5225 SmallVector<int, 8> Mask;
5226 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5227 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5230 /// Returns a vector_shuffle node for an unpackh operation.
5231 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5232 SDValue V1, SDValue V2) {
5233 SmallVector<int, 8> Mask;
5234 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5235 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5238 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5239 /// This produces a shuffle where the low element of V2 is swizzled into the
5240 /// zero/undef vector, landing at element Idx.
5241 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5242 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5244 const X86Subtarget &Subtarget,
5245 SelectionDAG &DAG) {
5246 MVT VT = V2.getSimpleValueType();
5248 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5249 int NumElems = VT.getVectorNumElements();
5250 SmallVector<int, 16> MaskVec(NumElems);
5251 for (int i = 0; i != NumElems; ++i)
5252 // If this is the insertion idx, put the low elt of V2 here.
5253 MaskVec[i] = (i == Idx) ? NumElems : i;
5254 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5257 static SDValue peekThroughBitcasts(SDValue V) {
5258 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5259 V = V.getOperand(0);
5263 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5264 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5265 V.getOperand(0).hasOneUse())
5266 V = V.getOperand(0);
5270 static const Constant *getTargetConstantFromNode(SDValue Op) {
5271 Op = peekThroughBitcasts(Op);
5273 auto *Load = dyn_cast<LoadSDNode>(Op);
5277 SDValue Ptr = Load->getBasePtr();
5278 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5279 Ptr->getOpcode() == X86ISD::WrapperRIP)
5280 Ptr = Ptr->getOperand(0);
5282 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5283 if (!CNode || CNode->isMachineConstantPoolEntry())
5286 return dyn_cast<Constant>(CNode->getConstVal());
5289 // Extract raw constant bits from constant pools.
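// Illustrative example: querying a v4i32 constant <0, 1, 2, 3> with
// EltSizeInBits == 64 repacks it into two 64-bit elements,
// 0x0000000100000000 and 0x0000000300000002.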
5290 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5292 SmallVectorImpl<APInt> &EltBits,
5293 bool AllowWholeUndefs = true,
5294 bool AllowPartialUndefs = true) {
5295 assert(EltBits.empty() && "Expected an empty EltBits vector");
5297 Op = peekThroughBitcasts(Op);
5299 EVT VT = Op.getValueType();
5300 unsigned SizeInBits = VT.getSizeInBits();
5301 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5302 unsigned NumElts = SizeInBits / EltSizeInBits;
5304 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5305 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5307 // Extract all the undef/constant element data and pack into single bitsets.
5308 APInt UndefBits(SizeInBits, 0);
5309 APInt MaskBits(SizeInBits, 0);
5311 // Split the undef/constant single bitset data into the target elements.
5312 auto SplitBitData = [&]() {
5313 // Don't split if we don't allow undef bits.
5314 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5315 if (UndefBits.getBoolValue() && !AllowUndefs)
5318 UndefElts = APInt(NumElts, 0);
5319 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5321 for (unsigned i = 0; i != NumElts; ++i) {
5322 unsigned BitOffset = i * EltSizeInBits;
5323 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5325 // Only treat an element as UNDEF if all bits are UNDEF.
5326 if (UndefEltBits.isAllOnesValue()) {
5327 if (!AllowWholeUndefs)
5329 UndefElts.setBit(i);
5334       // If only some bits are UNDEF then treat them as zero (or bail if not supported).
5335 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5338 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5339 EltBits[i] = Bits.getZExtValue();
5344 // Collect constant bits and insert into mask/undef bit masks.
5345 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5346 unsigned BitOffset) {
5349 if (isa<UndefValue>(Cst)) {
5350 unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
5351 Undefs.setBits(BitOffset, BitOffset + CstSizeInBits);
5354 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5355 Mask.insertBits(CInt->getValue(), BitOffset);
5358 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5359 Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset);
5365 // Extract constant bits from build vector.
5366 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5367 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5368 const SDValue &Src = Op.getOperand(i);
5369 unsigned BitOffset = i * SrcEltSizeInBits;
5370 if (Src.isUndef()) {
5371 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5374 auto *Cst = cast<ConstantSDNode>(Src);
5375 APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5376 MaskBits.insertBits(Bits, BitOffset);
5378 return SplitBitData();
5381 // Extract constant bits from constant pool vector.
5382 if (auto *Cst = getTargetConstantFromNode(Op)) {
5383 Type *CstTy = Cst->getType();
5384 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5387 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5388 for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i)
5389 if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits,
5390 i * CstEltSizeInBits))
5393 return SplitBitData();
5396 // Extract constant bits from a broadcasted constant pool scalar.
5397 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5398 EltSizeInBits <= SrcEltSizeInBits) {
5399 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5400 APInt Bits(SizeInBits, 0);
5401 APInt Undefs(SizeInBits, 0);
5402 if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) {
5403 for (unsigned i = 0; i != NumSrcElts; ++i) {
5404 MaskBits |= Bits.shl(i * SrcEltSizeInBits);
5405 UndefBits |= Undefs.shl(i * SrcEltSizeInBits);
5407 return SplitBitData();
5412 // Extract a rematerialized scalar constant insertion.
5413 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5414 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5415 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5416 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5417 MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5418 MaskBits = MaskBits.zext(SizeInBits);
5419 return SplitBitData();
5425 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5426 unsigned MaskEltSizeInBits,
5427 SmallVectorImpl<uint64_t> &RawMask) {
5429 SmallVector<APInt, 64> EltBits;
5431 // Extract the raw target constant bits.
5432 // FIXME: We currently don't support UNDEF bits or mask entries.
5433 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5434 EltBits, /* AllowWholeUndefs */ false,
5435 /* AllowPartialUndefs */ false))
5438 // Insert the extracted elements into the mask.
5439 for (APInt Elt : EltBits)
5440 RawMask.push_back(Elt.getZExtValue());
5445 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5446 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5447 /// operands in \p Ops, and returns true.
5448 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5449 /// IsUnary for shuffles which use a single input multiple times, and in those
5450 /// cases it will adjust the mask to only have indices within that single input.
5451 /// It is an error to call this with non-empty Mask/Ops vectors.
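///
/// For example, an X86ISD::UNPCKL of v4f32 with both operands equal decodes
/// to <0, 4, 1, 5> and is then remapped to the unary mask <0, 0, 1, 1>.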
5452 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5453 SmallVectorImpl<SDValue> &Ops,
5454 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5455 unsigned NumElems = VT.getVectorNumElements();
5458 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5459 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5462 bool IsFakeUnary = false;
5463 switch(N->getOpcode()) {
5464 case X86ISD::BLENDI:
5465 ImmN = N->getOperand(N->getNumOperands()-1);
5466 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5467 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5470 ImmN = N->getOperand(N->getNumOperands()-1);
5471 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5472 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5474 case X86ISD::INSERTPS:
5475 ImmN = N->getOperand(N->getNumOperands()-1);
5476 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5477 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5479 case X86ISD::UNPCKH:
5480 DecodeUNPCKHMask(VT, Mask);
5481 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5483 case X86ISD::UNPCKL:
5484 DecodeUNPCKLMask(VT, Mask);
5485 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5487 case X86ISD::MOVHLPS:
5488 DecodeMOVHLPSMask(NumElems, Mask);
5489 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5491 case X86ISD::MOVLHPS:
5492 DecodeMOVLHPSMask(NumElems, Mask);
5493 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5495 case X86ISD::PALIGNR:
5496 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5497 ImmN = N->getOperand(N->getNumOperands()-1);
5498 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5499 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5500 Ops.push_back(N->getOperand(1));
5501 Ops.push_back(N->getOperand(0));
5503 case X86ISD::VSHLDQ:
5504 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5505 ImmN = N->getOperand(N->getNumOperands() - 1);
5506 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5509 case X86ISD::VSRLDQ:
5510 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5511 ImmN = N->getOperand(N->getNumOperands() - 1);
5512 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5515 case X86ISD::PSHUFD:
5516 case X86ISD::VPERMILPI:
5517 ImmN = N->getOperand(N->getNumOperands()-1);
5518 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5521 case X86ISD::PSHUFHW:
5522 ImmN = N->getOperand(N->getNumOperands()-1);
5523 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5526 case X86ISD::PSHUFLW:
5527 ImmN = N->getOperand(N->getNumOperands()-1);
5528 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5531 case X86ISD::VZEXT_MOVL:
5532 DecodeZeroMoveLowMask(VT, Mask);
5535 case X86ISD::VBROADCAST: {
5536 SDValue N0 = N->getOperand(0);
5537 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5538 // add the pre-extracted value to the Ops vector.
5539 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5540 N0.getOperand(0).getValueType() == VT &&
5541 N0.getConstantOperandVal(1) == 0)
5542 Ops.push_back(N0.getOperand(0));
5544 // We only decode broadcasts of same-sized vectors, unless the broadcast
5545 // came from an extract from the original width. If we found one, we
5546     // pushed it onto the Ops vector above.
5547 if (N0.getValueType() == VT || !Ops.empty()) {
5548 DecodeVectorBroadcast(VT, Mask);
5554 case X86ISD::VPERMILPV: {
5556 SDValue MaskNode = N->getOperand(1);
5557 unsigned MaskEltSize = VT.getScalarSizeInBits();
5558 SmallVector<uint64_t, 32> RawMask;
5559 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5560 DecodeVPERMILPMask(VT, RawMask, Mask);
5563 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5564 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5569 case X86ISD::PSHUFB: {
5571 SDValue MaskNode = N->getOperand(1);
5572 SmallVector<uint64_t, 32> RawMask;
5573 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5574 DecodePSHUFBMask(RawMask, Mask);
5577 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5578 DecodePSHUFBMask(C, Mask);
5583 case X86ISD::VPERMI:
5584 ImmN = N->getOperand(N->getNumOperands()-1);
5585 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5590 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5592 case X86ISD::VPERM2X128:
5593 ImmN = N->getOperand(N->getNumOperands()-1);
5594 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5595 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5597 case X86ISD::MOVSLDUP:
5598 DecodeMOVSLDUPMask(VT, Mask);
5601 case X86ISD::MOVSHDUP:
5602 DecodeMOVSHDUPMask(VT, Mask);
5605 case X86ISD::MOVDDUP:
5606 DecodeMOVDDUPMask(VT, Mask);
5609 case X86ISD::MOVLHPD:
5610 case X86ISD::MOVLPD:
5611 case X86ISD::MOVLPS:
5612 // Not yet implemented
5614 case X86ISD::VPERMIL2: {
5615 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5616 unsigned MaskEltSize = VT.getScalarSizeInBits();
5617 SDValue MaskNode = N->getOperand(2);
5618 SDValue CtrlNode = N->getOperand(3);
5619 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5620 unsigned CtrlImm = CtrlOp->getZExtValue();
5621 SmallVector<uint64_t, 32> RawMask;
5622 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5623 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5626 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5627 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5633 case X86ISD::VPPERM: {
5634 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5635 SDValue MaskNode = N->getOperand(2);
5636 SmallVector<uint64_t, 32> RawMask;
5637 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5638 DecodeVPPERMMask(RawMask, Mask);
5641 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5642 DecodeVPPERMMask(C, Mask);
5647 case X86ISD::VPERMV: {
5649 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5650 Ops.push_back(N->getOperand(1));
5651 SDValue MaskNode = N->getOperand(0);
5652 SmallVector<uint64_t, 32> RawMask;
5653 unsigned MaskEltSize = VT.getScalarSizeInBits();
5654 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5655 DecodeVPERMVMask(RawMask, Mask);
5658 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5659 DecodeVPERMVMask(C, MaskEltSize, Mask);
5664 case X86ISD::VPERMV3: {
5665 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5666 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5667 Ops.push_back(N->getOperand(0));
5668 Ops.push_back(N->getOperand(2));
5669 SDValue MaskNode = N->getOperand(1);
5670 unsigned MaskEltSize = VT.getScalarSizeInBits();
5671 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5672 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5677 case X86ISD::VPERMIV3: {
5678 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5679 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5680 Ops.push_back(N->getOperand(1));
5681 Ops.push_back(N->getOperand(2));
5682 SDValue MaskNode = N->getOperand(0);
5683 unsigned MaskEltSize = VT.getScalarSizeInBits();
5684 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5685 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5690 default: llvm_unreachable("unknown target shuffle node");
5693 // Empty mask indicates the decode failed.
5697 // Check if we're getting a shuffle mask with zero'd elements.
5698 if (!AllowSentinelZero)
5699 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5702 // If we have a fake unary shuffle, the shuffle mask is spread across two
5703 // inputs that are actually the same node. Re-map the mask to always point
5704 // into the first input.
5707 if (M >= (int)Mask.size())
5710 // If we didn't already add operands in the opcode-specific code, default to
5711 // adding 1 or 2 operands starting at 0.
5713 Ops.push_back(N->getOperand(0));
5714 if (!IsUnary || IsFakeUnary)
5715 Ops.push_back(N->getOperand(1));
5721 /// Check a target shuffle mask's inputs to see if we can set any values to
5722 /// SM_SentinelZero - this is for elements that are known to be zero
5723 /// (not just zeroable) from their inputs.
5724 /// Returns true if the target shuffle mask was decoded.
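///
/// For example, if one shuffle input is a build vector of constants, any lane
/// that reads a known-zero element of it is rewritten to SM_SentinelZero (and
/// lanes reading undef elements become SM_SentinelUndef).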
5725 static bool setTargetShuffleZeroElements(SDValue N,
5726 SmallVectorImpl<int> &Mask,
5727 SmallVectorImpl<SDValue> &Ops) {
5729 if (!isTargetShuffle(N.getOpcode()))
5732 MVT VT = N.getSimpleValueType();
5733 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5736 SDValue V1 = Ops[0];
5737 SDValue V2 = IsUnary ? V1 : Ops[1];
5739 V1 = peekThroughBitcasts(V1);
5740 V2 = peekThroughBitcasts(V2);
5742 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5743 "Illegal split of shuffle value type");
5744 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5746 // Extract known constant input data.
5747 APInt UndefSrcElts[2];
5748 SmallVector<APInt, 32> SrcEltBits[2];
5749 bool IsSrcConstant[2] = {
5750 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5751 SrcEltBits[0], true, false),
5752 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5753 SrcEltBits[1], true, false)};
5755 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5758 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5762 // Determine shuffle input and normalize the mask.
5763 unsigned SrcIdx = M / Size;
5764 SDValue V = M < Size ? V1 : V2;
5767 // We are referencing an UNDEF input.
5769 Mask[i] = SM_SentinelUndef;
5773 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5774 // TODO: We currently only set UNDEF for integer types - floats use the same
5775 // registers as vectors and many of the scalar folded loads rely on the
5776 // SCALAR_TO_VECTOR pattern.
5777 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5778 (Size % V.getValueType().getVectorNumElements()) == 0) {
5779 int Scale = Size / V.getValueType().getVectorNumElements();
5780 int Idx = M / Scale;
5781 if (Idx != 0 && !VT.isFloatingPoint())
5782 Mask[i] = SM_SentinelUndef;
5783 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5784 Mask[i] = SM_SentinelZero;
5788 // Attempt to extract from the source's constant bits.
5789 if (IsSrcConstant[SrcIdx]) {
5790 if (UndefSrcElts[SrcIdx][M])
5791 Mask[i] = SM_SentinelUndef;
5792 else if (SrcEltBits[SrcIdx][M] == 0)
5793 Mask[i] = SM_SentinelZero;
5797 assert(VT.getVectorNumElements() == Mask.size() &&
5798 "Different mask size from vector size!");
5802 // Attempt to decode ops that could be represented as a shuffle mask.
5803 // The decoded shuffle mask may contain a different number of elements from
5804 // the destination value type.
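//
// For example, an X86ISD::VSRLI of v2i64 by 8 bits is decoded below as the
// byte-level mask <1..7, SM_SentinelZero, 9..15, SM_SentinelZero>.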
5805 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5806 SmallVectorImpl<SDValue> &Ops) {
5810 MVT VT = N.getSimpleValueType();
5811 unsigned NumElts = VT.getVectorNumElements();
5812 unsigned NumSizeInBits = VT.getSizeInBits();
5813 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5814 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5815 "Expected byte aligned value types");
5817 unsigned Opcode = N.getOpcode();
5820 case X86ISD::ANDNP: {
5821 // Attempt to decode as a per-byte mask.
5823 SmallVector<APInt, 32> EltBits;
5824 SDValue N0 = N.getOperand(0);
5825 SDValue N1 = N.getOperand(1);
5826 bool IsAndN = (X86ISD::ANDNP == Opcode);
5827 uint64_t ZeroMask = IsAndN ? 255 : 0;
5828 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5830 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5832 Mask.push_back(SM_SentinelUndef);
5835 uint64_t ByteBits = EltBits[i].getZExtValue();
5836 if (ByteBits != 0 && ByteBits != 255)
5838 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5840 Ops.push_back(IsAndN ? N1 : N0);
5843 case ISD::SCALAR_TO_VECTOR: {
5844 // Match against a scalar_to_vector of an extract from a similar vector.
5845 SDValue N0 = N.getOperand(0);
5846 if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5847 N0.getOperand(0).getValueType() != VT ||
5848 !isa<ConstantSDNode>(N0.getOperand(1)) ||
5849 NumElts <= N0.getConstantOperandVal(1) ||
5850 !N->isOnlyUserOf(N0.getNode()))
5852 Ops.push_back(N0.getOperand(0));
5853 Mask.push_back(N0.getConstantOperandVal(1));
5854 Mask.append(NumElts - 1, SM_SentinelUndef);
5857 case X86ISD::PINSRB:
5858 case X86ISD::PINSRW: {
5859 SDValue InVec = N.getOperand(0);
5860 SDValue InScl = N.getOperand(1);
5861 uint64_t InIdx = N.getConstantOperandVal(2);
5862 assert(InIdx < NumElts && "Illegal insertion index");
5864 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
5865 if (X86::isZeroNode(InScl)) {
5866 Ops.push_back(InVec);
5867 for (unsigned i = 0; i != NumElts; ++i)
5868 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
5872 // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
5873 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
5875 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
5876 if (InScl.getOpcode() != ISD::AssertZext ||
5877 InScl.getOperand(0).getOpcode() != ExOp)
5880 SDValue ExVec = InScl.getOperand(0).getOperand(0);
5881 uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
5882 assert(ExIdx < NumElts && "Illegal extraction index");
5883 Ops.push_back(InVec);
5884 Ops.push_back(ExVec);
5885 for (unsigned i = 0; i != NumElts; ++i)
5886 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
5890 case X86ISD::VSRLI: {
5891 uint64_t ShiftVal = N.getConstantOperandVal(1);
5892 // Out of range bit shifts are guaranteed to be zero.
5893 if (NumBitsPerElt <= ShiftVal) {
5894 Mask.append(NumElts, SM_SentinelZero);
5898 // We can only decode 'whole byte' bit shifts as shuffles.
5899 if ((ShiftVal % 8) != 0)
5902 uint64_t ByteShift = ShiftVal / 8;
5903 unsigned NumBytes = NumSizeInBits / 8;
5904 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5905 Ops.push_back(N.getOperand(0));
5907 // Clear mask to all zeros and insert the shifted byte indices.
5908 Mask.append(NumBytes, SM_SentinelZero);
5910 if (X86ISD::VSHLI == Opcode) {
5911 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5912 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5913 Mask[i + j] = i + j - ByteShift;
5915 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5916 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5917 Mask[i + j - ByteShift] = i + j;
5921 case ISD::ZERO_EXTEND_VECTOR_INREG:
5922 case X86ISD::VZEXT: {
5923 // TODO - add support for VPMOVZX with smaller input vector types.
5924 SDValue Src = N.getOperand(0);
5925 MVT SrcVT = Src.getSimpleValueType();
5926 if (NumSizeInBits != SrcVT.getSizeInBits())
5928 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
5937 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
5938 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
5939 SmallVectorImpl<int> &Mask) {
5940 int MaskWidth = Mask.size();
5941 SmallVector<SDValue, 16> UsedInputs;
5942 for (int i = 0, e = Inputs.size(); i < e; ++i) {
5943 int lo = UsedInputs.size() * MaskWidth;
5944 int hi = lo + MaskWidth;
5945 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
5946 UsedInputs.push_back(Inputs[i]);
5953 Inputs = UsedInputs;
5956 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
5957 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
5958 /// remaining input indices in case we now have a unary shuffle and adjust the
5959 /// inputs accordingly.
5960 /// Returns true if the target shuffle mask was decoded.
5961 static bool resolveTargetShuffleInputs(SDValue Op,
5962 SmallVectorImpl<SDValue> &Inputs,
5963 SmallVectorImpl<int> &Mask) {
5964 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
5965 if (!getFauxShuffleMask(Op, Mask, Inputs))
5968 resolveTargetShuffleInputsAndMask(Inputs, Mask);
5972 /// Returns the scalar element that will make up the ith
5973 /// element of the result of the vector shuffle.
5974 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5977 return SDValue(); // Limit search depth.
5979 SDValue V = SDValue(N, 0);
5980 EVT VT = V.getValueType();
5981 unsigned Opcode = V.getOpcode();
5983 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5984 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5985 int Elt = SV->getMaskElt(Index);
5988 return DAG.getUNDEF(VT.getVectorElementType());
5990 unsigned NumElems = VT.getVectorNumElements();
5991 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5992 : SV->getOperand(1);
5993 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5996 // Recurse into target specific vector shuffles to find scalars.
5997 if (isTargetShuffle(Opcode)) {
5998 MVT ShufVT = V.getSimpleValueType();
5999 MVT ShufSVT = ShufVT.getVectorElementType();
6000 int NumElems = (int)ShufVT.getVectorNumElements();
6001 SmallVector<int, 16> ShuffleMask;
6002 SmallVector<SDValue, 16> ShuffleOps;
6005 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6008 int Elt = ShuffleMask[Index];
6009 if (Elt == SM_SentinelZero)
6010 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6011 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6012 if (Elt == SM_SentinelUndef)
6013 return DAG.getUNDEF(ShufSVT);
6015 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6016 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6017 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6021 // Actual nodes that may contain scalar elements
6022 if (Opcode == ISD::BITCAST) {
6023 V = V.getOperand(0);
6024 EVT SrcVT = V.getValueType();
6025 unsigned NumElems = VT.getVectorNumElements();
6027 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6031 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6032 return (Index == 0) ? V.getOperand(0)
6033 : DAG.getUNDEF(VT.getVectorElementType());
6035 if (V.getOpcode() == ISD::BUILD_VECTOR)
6036 return V.getOperand(Index);
6041 /// Custom lower build_vector of v16i8.
6042 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6043 unsigned NumNonZero, unsigned NumZero,
6045 const X86Subtarget &Subtarget) {
6046 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6053 // SSE4.1 - use PINSRB to insert each byte directly.
6054 if (Subtarget.hasSSE41()) {
6055 for (unsigned i = 0; i < 16; ++i) {
6056 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6058       // If the build vector contains zeros, or our first insertion is not at
6059       // index 0, insert into a zero vector to break any register dependency;
6060       // otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
6063 if (NumZero || 0 != i)
6064 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
6066 assert(0 == i && "Expected insertion into zero-index");
6067 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6068 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6069 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6070 V = DAG.getBitcast(MVT::v16i8, V);
6074 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
6075 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6082 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
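  // For example, bytes 2 and 3 are zero-extended, merged as (b3 << 8) | b2,
  // and inserted as 16-bit word element 1 (index i/2).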
6083 for (unsigned i = 0; i < 16; ++i) {
6084 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6085 if (ThisIsNonZero && First) {
6087 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6089 V = DAG.getUNDEF(MVT::v8i16);
6094 // FIXME: Investigate extending to i32 instead of just i16.
6095 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
6096 SDValue ThisElt, LastElt;
6097 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6098 if (LastIsNonZero) {
6100 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6102 if (ThisIsNonZero) {
6103 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6104 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6105 DAG.getConstant(8, dl, MVT::i8));
6107 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6113 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6114 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6115 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6116 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6117 V = DAG.getBitcast(MVT::v8i16, V);
6119 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6120 DAG.getIntPtrConstant(i / 2, dl));
6126 return DAG.getBitcast(MVT::v16i8, V);
6129 /// Custom lower build_vector of v8i16.
6130 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6131 unsigned NumNonZero, unsigned NumZero,
6133 const X86Subtarget &Subtarget) {
6134 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6140 for (unsigned i = 0; i < 8; ++i) {
6141 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6143 // If the build vector contains zeros or our first insertion is not the
6144 // first index, then insert into a zero vector to break any register
6145 // dependency; otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
6148 if (NumZero || 0 != i)
6149 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6151 assert(0 == i && "Expected insertion into zero-index");
6152 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6153 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6154 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6155 V = DAG.getBitcast(MVT::v8i16, V);
6159 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
6160 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6167 /// Custom lower build_vector of v4i32 or v4f32.
6168 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6169 const X86Subtarget &Subtarget) {
6170 // Find all zeroable elements.
6171 std::bitset<4> Zeroable;
6172 for (int i=0; i < 4; ++i) {
6173 SDValue Elt = Op->getOperand(i);
6174 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6176 assert(Zeroable.size() - Zeroable.count() > 1 &&
6177 "We expect at least two non-zero elements!");
6179 // We only know how to deal with build_vector nodes where elements are either
6180 // zeroable or extract_vector_elt with constant index.
6181 SDValue FirstNonZero;
6182 unsigned FirstNonZeroIdx;
6183 for (unsigned i=0; i < 4; ++i) {
6186 SDValue Elt = Op->getOperand(i);
6187 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6188 !isa<ConstantSDNode>(Elt.getOperand(1)))
6190 // Make sure that this node is extracting from a 128-bit vector.
6191 MVT VT = Elt.getOperand(0).getSimpleValueType();
6192 if (!VT.is128BitVector())
6194 if (!FirstNonZero.getNode()) {
6196 FirstNonZeroIdx = i;
6200 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6201 SDValue V1 = FirstNonZero.getOperand(0);
6202 MVT VT = V1.getSimpleValueType();
6204 // See if this build_vector can be lowered as a blend with zero.
6206 unsigned EltMaskIdx, EltIdx;
6208 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6209 if (Zeroable[EltIdx]) {
6210 // The zero vector will be on the right hand side.
6211 Mask[EltIdx] = EltIdx+4;
6215 Elt = Op->getOperand(EltIdx);
6216 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
6217 EltMaskIdx = Elt.getConstantOperandVal(1);
6218 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6220 Mask[EltIdx] = EltIdx;
6224 // Let the shuffle legalizer deal with blend operations.
6225 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6226 if (V1.getSimpleValueType() != VT)
6227 V1 = DAG.getBitcast(VT, V1);
6228 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6232 // See if we can lower this build_vector to an INSERTPS.
6232 if (!Subtarget.hasSSE41())
6235 SDValue V2 = Elt.getOperand(0);
6236 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6239 bool CanFold = true;
6240 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6244 SDValue Current = Op->getOperand(i);
6245 SDValue SrcVector = Current->getOperand(0);
6248 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6254 assert(V1.getNode() && "Expected at least two non-zero elements!");
6255 if (V1.getSimpleValueType() != MVT::v4f32)
6256 V1 = DAG.getBitcast(MVT::v4f32, V1);
6257 if (V2.getSimpleValueType() != MVT::v4f32)
6258 V2 = DAG.getBitcast(MVT::v4f32, V2);
6260 // Ok, we can emit an INSERTPS instruction.
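// The INSERTPS immediate encodes the source element index in bits [7:6], the
// destination element index in bits [5:4], and the zero mask in bits [3:0].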
6261 unsigned ZMask = Zeroable.to_ulong();
6263 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6264 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6266 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6267 DAG.getIntPtrConstant(InsertPSMask, DL));
6268 return DAG.getBitcast(VT, Result);
6271 /// Return a vector logical shift node.
6272 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6273 SelectionDAG &DAG, const TargetLowering &TLI,
6275 assert(VT.is128BitVector() && "Unknown type for VShift");
6276 MVT ShVT = MVT::v16i8;
6277 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6278 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6279 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6280 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6281 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6282 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6285 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6286 SelectionDAG &DAG) {
6288 // Check if the scalar load can be widened into a vector load, and if
6289 // the address is "base + cst", see if the cst can be "absorbed" into
6290 // the shuffle mask.
6291 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6292 SDValue Ptr = LD->getBasePtr();
6293 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6295 EVT PVT = LD->getValueType(0);
6296 if (PVT != MVT::i32 && PVT != MVT::f32)
6301 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6302 FI = FINode->getIndex();
6304 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6305 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6306 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6307 Offset = Ptr.getConstantOperandVal(1);
6308 Ptr = Ptr.getOperand(0);
6313 // FIXME: 256-bit vector instructions don't require strict alignment;
6314 // improve this code to support them better.
6315 unsigned RequiredAlign = VT.getSizeInBits()/8;
6316 SDValue Chain = LD->getChain();
6317 // Make sure the stack object alignment is at least 16 or 32.
6318 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6319 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6320 if (MFI.isFixedObjectIndex(FI)) {
6321 // Can't change the alignment. FIXME: It's possible to compute
6322 // the exact stack offset and reference FI + adjust offset instead,
6323 // if someone *really* cares about this; that's the way to implement it.
6326 MFI.setObjectAlignment(FI, RequiredAlign);
6330 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6331 // Ptr + (Offset & ~15).
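// For example, with RequiredAlign = 16 and Offset = 20 the load is rebased to
// StartOffset = (20 & ~15) = 16 and the splatted element becomes
// EltNo = (20 - 16) >> 2 = 1.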
6334 if ((Offset % RequiredAlign) & 3)
6336 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6339 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6340 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6343 int EltNo = (Offset - StartOffset) >> 2;
6344 unsigned NumElems = VT.getVectorNumElements();
6346 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6347 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6348 LD->getPointerInfo().getWithOffset(StartOffset));
6350 SmallVector<int, 8> Mask(NumElems, EltNo);
6352 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6358 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6359 /// elements can be replaced by a single large load which has the same value as
6360 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6362 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6363 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6364 const SDLoc &DL, SelectionDAG &DAG,
6365 bool isAfterLegalize) {
6366 unsigned NumElems = Elts.size();
6368 int LastLoadedElt = -1;
6369 SmallBitVector LoadMask(NumElems, false);
6370 SmallBitVector ZeroMask(NumElems, false);
6371 SmallBitVector UndefMask(NumElems, false);
6373 // For each element in the initializer, see if we've found a load, zero or an undef.
6375 for (unsigned i = 0; i < NumElems; ++i) {
6376 SDValue Elt = peekThroughBitcasts(Elts[i]);
6381 UndefMask[i] = true;
6382 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6384 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6387 // Each loaded element must be the correct fractional portion of the
6388 // requested vector load.
6389 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6394 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6395 "Incomplete element masks");
6397 // Handle Special Cases - all undef or undef/zero.
6398 if (UndefMask.count() == NumElems)
6399 return DAG.getUNDEF(VT);
6401 // FIXME: Should we return this as a BUILD_VECTOR instead?
6402 if ((ZeroMask | UndefMask).count() == NumElems)
6403 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6404 : DAG.getConstantFP(0.0, DL, VT);
6406 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6407 int FirstLoadedElt = LoadMask.find_first();
6408 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6409 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6410 EVT LDBaseVT = EltBase.getValueType();
6412 // Consecutive loads can contain UNDEFS but not ZERO elements.
6413 // Consecutive loads with UNDEF and ZERO elements require an
6414 // additional shuffle stage to clear the ZERO elements.
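// For example, <load a[0], load a[1], zero, load a[3]> can be lowered as a
// single wide load of a[0..3] followed by a shuffle with a zero vector that
// re-zeroes element 2.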
6415 bool IsConsecutiveLoad = true;
6416 bool IsConsecutiveLoadWithZeros = true;
6417 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6419 SDValue Elt = peekThroughBitcasts(Elts[i]);
6420 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6421 if (!DAG.areNonVolatileConsecutiveLoads(
6422 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6423 i - FirstLoadedElt)) {
6424 IsConsecutiveLoad = false;
6425 IsConsecutiveLoadWithZeros = false;
6428 } else if (ZeroMask[i]) {
6429 IsConsecutiveLoad = false;
6433 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6434 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6435 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6436 "Cannot merge volatile loads.");
6438 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6439 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6441 if (LDBase->hasAnyUseOfValue(1)) {
6443 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6444 SDValue(NewLd.getNode(), 1));
6445 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6446 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6447 SDValue(NewLd.getNode(), 1));
6453 // LOAD - all consecutive load/undefs (must start/end with a load).
6454 // If we have found an entire vector of loads and undefs, then return a large
6455 // load of the entire vector width starting at the base pointer.
6456 // If the vector contains zeros, then attempt to shuffle those elements.
6457 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6458 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6459 assert(LDBase && "Did not find base load for merging consecutive loads");
6460 EVT EltVT = LDBase->getValueType(0);
6461 // Ensure that the input vector size for the merged loads matches the
6462 // cumulative size of the input elements.
6463 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6466 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6469 if (IsConsecutiveLoad)
6470 return CreateLoad(VT, LDBase);
6472 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6473 // vector and a zero vector to clear out the zero elements.
6474 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6475 SmallVector<int, 4> ClearMask(NumElems, -1);
6476 for (unsigned i = 0; i < NumElems; ++i) {
6478 ClearMask[i] = i + NumElems;
6479 else if (LoadMask[i])
6482 SDValue V = CreateLoad(VT, LDBase);
6483 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6484 : DAG.getConstantFP(0.0, DL, VT);
6485 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6490 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6492 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
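// For example, a v4i32 build_vector <load a[0], load a[1], zero, zero> has
// LoadSize == 64 and can be lowered as a zero-extending v2i64 VZEXT_LOAD.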
6493 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6494 (LoadSize == 32 || LoadSize == 64) &&
6495 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6496 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6497 : MVT::getIntegerVT(LoadSize);
6498 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6499 if (TLI.isTypeLegal(VecVT)) {
6500 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6501 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6503 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6504 LDBase->getPointerInfo(),
6505 LDBase->getAlignment(),
6506 false/*isVolatile*/, true/*ReadMem*/,
6509 // Make sure the newly-created LOAD is in the same position as LDBase in
6510 // terms of dependency. We create a TokenFactor for LDBase and ResNode,
6511 // and update uses of LDBase's output chain to use the TokenFactor.
6512 if (LDBase->hasAnyUseOfValue(1)) {
6514 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6515 SDValue(ResNode.getNode(), 1));
6516 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6517 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6518 SDValue(ResNode.getNode(), 1));
6521 return DAG.getBitcast(VT, ResNode);
6528 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6529 unsigned SplatBitSize, LLVMContext &C) {
6530 unsigned ScalarSize = VT.getScalarSizeInBits();
6531 unsigned NumElm = SplatBitSize / ScalarSize;
6533 SmallVector<Constant *, 32> ConstantVec;
6534 for (unsigned i = 0; i < NumElm; i++) {
6535 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6537 if (VT.isFloatingPoint()) {
6538 assert((ScalarSize == 32 || ScalarSize == 64) &&
6539 "Unsupported floating point scalar size");
6540 if (ScalarSize == 32)
6541 Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
6543 Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
6545 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6546 ConstantVec.push_back(Const);
6548 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6551 static bool isUseOfShuffle(SDNode *N) {
6552 for (auto *U : N->uses()) {
6553 if (isTargetShuffle(U->getOpcode()))
6555 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6556 return isUseOfShuffle(U);
6561 /// Attempt to use the vbroadcast instruction to generate a splat value
6562 /// from a splat BUILD_VECTOR which uses:
6563 /// a. A single scalar load, or a constant.
6564 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6566 /// The VBROADCAST node is returned when a pattern is found,
6567 /// or SDValue() otherwise.
6568 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6569 const X86Subtarget &Subtarget,
6570 SelectionDAG &DAG) {
6571 // VBROADCAST requires AVX.
6572 // TODO: Splats could be generated for non-AVX CPUs using SSE
6573 // instructions, but there's less potential gain for only 128-bit vectors.
6574 if (!Subtarget.hasAVX())
6577 MVT VT = BVOp->getSimpleValueType(0);
6580 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6581 "Unsupported vector type for broadcast.");
6583 BitVector UndefElements;
6584 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6586 // We need a splat of a single value to use broadcast, and it doesn't
6587 // make any sense if the value is only in one element of the vector.
6588 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6589 APInt SplatValue, Undef;
6590 unsigned SplatBitSize;
6592 // Check if this is a repeated constant pattern suitable for broadcasting.
6593 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6594 SplatBitSize > VT.getScalarSizeInBits() &&
6595 SplatBitSize < VT.getSizeInBits()) {
6596 // Avoid replacing with broadcast when it's a use of a shuffle
6597 // instruction to preserve the present custom lowering of shuffles.
6598 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6600 // replace BUILD_VECTOR with broadcast of the repeated constants.
6601 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6602 LLVMContext *Ctx = DAG.getContext();
6603 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6604 if (Subtarget.hasAVX()) {
6605 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6606 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6607 // Splatted value can fit in one INTEGER constant in constant pool.
6608 // Load the constant and broadcast it.
6609 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6610 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6611 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6612 SDValue CP = DAG.getConstantPool(C, PVT);
6613 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6615 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6617 CVT, dl, DAG.getEntryNode(), CP,
6618 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6620 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6621 MVT::getVectorVT(CVT, Repeat), Ld);
6622 return DAG.getBitcast(VT, Brdcst);
6623 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6624 // Splatted value can fit in one FLOAT constant in constant pool.
6625 // Load the constant and broadcast it.
6626 // AVX has support for 32 and 64 bit broadcasts for floats only.
6627 // A 64-bit integer constant cannot be used on a 32-bit subtarget.
6628 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6629 Constant *C = SplatBitSize == 32
6630 ? ConstantFP::get(Type::getFloatTy(*Ctx),
6631 SplatValue.bitsToFloat())
6632 : ConstantFP::get(Type::getDoubleTy(*Ctx),
6633 SplatValue.bitsToDouble());
6634 SDValue CP = DAG.getConstantPool(C, PVT);
6635 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6637 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6639 CVT, dl, DAG.getEntryNode(), CP,
6640 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6642 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6643 MVT::getVectorVT(CVT, Repeat), Ld);
6644 return DAG.getBitcast(VT, Brdcst);
6645 } else if (SplatBitSize > 64) {
6646 // Load the vector of constants and broadcast it.
6647 MVT CVT = VT.getScalarType();
6648 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6650 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6651 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6652 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6654 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6655 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6657 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6658 return DAG.getBitcast(VT, Brdcst);
6665 bool ConstSplatVal =
6666 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6668 // Make sure that all of the users of a non-constant load are from the
6669 // BUILD_VECTOR node.
6670 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6673 unsigned ScalarSize = Ld.getValueSizeInBits();
6674 bool IsGE256 = (VT.getSizeInBits() >= 256);
6676 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6677 // instruction to save 8 or more bytes of constant pool data.
6678 // TODO: If multiple splats are generated to load the same constant,
6679 // it may be detrimental to overall size. There needs to be a way to detect
6680 // that condition to know if this is truly a size win.
6681 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6683 // Handle broadcasting a single constant scalar from the constant pool into a vector.
6685 // On Sandybridge (no AVX2), it is still better to load a constant vector
6686 // from the constant pool and not to broadcast it from a scalar.
6687 // But override that restriction when optimizing for size.
6688 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6689 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6690 EVT CVT = Ld.getValueType();
6691 assert(!CVT.isVector() && "Must not broadcast a vector type");
6693 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6694 // For size optimization, also splat v2f64 and v2i64, and for size opt
6695 // with AVX2, also splat i8 and i16.
6696 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6697 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6698 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6699 const Constant *C = nullptr;
6700 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6701 C = CI->getConstantIntValue();
6702 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6703 C = CF->getConstantFPValue();
6705 assert(C && "Invalid constant type");
6707 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6709 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6710 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6712 CVT, dl, DAG.getEntryNode(), CP,
6713 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6716 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6720 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6722 // Handle AVX2 in-register broadcasts.
6723 if (!IsLoad && Subtarget.hasInt256() &&
6724 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6725 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6727 // The scalar source must be a normal load.
6731 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6732 (Subtarget.hasVLX() && ScalarSize == 64))
6733 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6735 // The integer check is needed for the 64-bit into 128-bit case, so that it
6736 // doesn't match double, since there is no vbroadcastsd xmm instruction.
6737 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6738 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6739 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6742 // Unsupported broadcast.
6746 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6747 /// underlying vector and index.
6751 /// Modifies \p ExtractedFromVec to the real vector and returns the real index.
6751 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6753 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6754 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6757 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already lowered
6759 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6761 // into (extract_vector_elt (vector_shuffle<2,u,u,u>
6762 //        (extract_subvector (v8f32 %vreg0), Constant<4>), undef), Constant<0>)
6765 // In this case the vector is the extract_subvector expression and the index
6766 // is 2, as specified by the shuffle.
6767 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6768 SDValue ShuffleVec = SVOp->getOperand(0);
6769 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6770 assert(ShuffleVecVT.getVectorElementType() ==
6771 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6773 int ShuffleIdx = SVOp->getMaskElt(Idx);
6774 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6775 ExtractedFromVec = ShuffleVec;
6781 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6782 MVT VT = Op.getSimpleValueType();
6784 // Skip if insert_vec_elt is not supported.
6785 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6786 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6790 unsigned NumElems = Op.getNumOperands();
6794 SmallVector<unsigned, 4> InsertIndices;
6795 SmallVector<int, 8> Mask(NumElems, -1);
6797 for (unsigned i = 0; i != NumElems; ++i) {
6798 unsigned Opc = Op.getOperand(i).getOpcode();
6800 if (Opc == ISD::UNDEF)
6803 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6804 // Quit if more than 1 element needs inserting.
6805 if (InsertIndices.size() > 1)
6808 InsertIndices.push_back(i);
6812 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6813 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6815 // Quit if non-constant index.
6816 if (!isa<ConstantSDNode>(ExtIdx))
6818 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6820 // Quit if extracted from vector of different type.
6821 if (ExtractedFromVec.getValueType() != VT)
6824 if (!VecIn1.getNode())
6825 VecIn1 = ExtractedFromVec;
6826 else if (VecIn1 != ExtractedFromVec) {
6827 if (!VecIn2.getNode())
6828 VecIn2 = ExtractedFromVec;
6829 else if (VecIn2 != ExtractedFromVec)
6830 // Quit if more than 2 vectors to shuffle
6834 if (ExtractedFromVec == VecIn1)
6836 else if (ExtractedFromVec == VecIn2)
6837 Mask[i] = Idx + NumElems;
6840 if (!VecIn1.getNode())
6843 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6844 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6846 for (unsigned Idx : InsertIndices)
6847 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6848 DAG.getIntPtrConstant(Idx, DL));
6853 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6854 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6855 Op.getScalarValueSizeInBits() == 1 &&
6856 "Can not convert non-constant vector");
6857 uint64_t Immediate = 0;
6858 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6859 SDValue In = Op.getOperand(idx);
6861 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6864 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
6865 return DAG.getConstant(Immediate, dl, VT);
6867 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6869 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6871 MVT VT = Op.getSimpleValueType();
6872 assert((VT.getVectorElementType() == MVT::i1) &&
6873 "Unexpected type in LowerBUILD_VECTORvXi1!");
6876 if (ISD::isBuildVectorAllZeros(Op.getNode()))
6877 return DAG.getTargetConstant(0, dl, VT);
6879 if (ISD::isBuildVectorAllOnes(Op.getNode()))
6880 return DAG.getTargetConstant(1, dl, VT);
6882 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6883 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6884 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6885 return DAG.getBitcast(VT, Imm);
6886 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6887 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6888 DAG.getIntPtrConstant(0, dl));
6891 // Vector has one or more non-const elements
6892 uint64_t Immediate = 0;
6893 SmallVector<unsigned, 16> NonConstIdx;
6894 bool IsSplat = true;
6895 bool HasConstElts = false;
6897 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6898 SDValue In = Op.getOperand(idx);
6901 if (!isa<ConstantSDNode>(In))
6902 NonConstIdx.push_back(idx);
6904 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6905 HasConstElts = true;
6909 else if (In != Op.getOperand(SplatIdx))
6913 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
6915 return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
6916 DAG.getConstant(1, dl, VT),
6917 DAG.getConstant(0, dl, VT));
6919 // insert elements one by one
6923 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6924 Imm = DAG.getConstant(Immediate, dl, ImmVT);
6926 else if (HasConstElts)
6927 Imm = DAG.getConstant(0, dl, VT);
6929 Imm = DAG.getUNDEF(VT);
6930 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6931 DstVec = DAG.getBitcast(VT, Imm);
6933 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6934 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6935 DAG.getIntPtrConstant(0, dl));
6938 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6939 unsigned InsertIdx = NonConstIdx[i];
6940 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6941 Op.getOperand(InsertIdx),
6942 DAG.getIntPtrConstant(InsertIdx, dl));
6947 /// \brief Return true if \p N implements a horizontal binop and return the
6948 /// operands for the horizontal binop into V0 and V1.
6950 /// This is a helper function of LowerToHorizontalOp().
6951 /// This function checks that the input build_vector \p N implements a
6952 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6953 /// operation to match.
6954 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6955 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6956 /// is equal to ISD::SUB, then this function checks if this is a horizontal arithmetic sub.
6959 /// This function only analyzes elements of \p N whose indices are
6960 /// in range [BaseIdx, LastIdx).
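/// For example, the v4f32 build_vector
///   <(fadd A[0], A[1]), (fadd A[2], A[3]), (fadd B[0], B[1]), (fadd B[2], B[3])>
/// matches a horizontal FADD with V0 = A and V1 = B.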
6961 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6963 unsigned BaseIdx, unsigned LastIdx,
6964 SDValue &V0, SDValue &V1) {
6965 EVT VT = N->getValueType(0);
6967 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6968 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6969 "Invalid Vector in input!");
6971 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6972 bool CanFold = true;
6973 unsigned ExpectedVExtractIdx = BaseIdx;
6974 unsigned NumElts = LastIdx - BaseIdx;
6975 V0 = DAG.getUNDEF(VT);
6976 V1 = DAG.getUNDEF(VT);
6978 // Check if N implements a horizontal binop.
6979 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6980 SDValue Op = N->getOperand(i + BaseIdx);
6983 if (Op->isUndef()) {
6984 // Update the expected vector extract index.
6985 if (i * 2 == NumElts)
6986 ExpectedVExtractIdx = BaseIdx;
6987 ExpectedVExtractIdx += 2;
6991 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6996 SDValue Op0 = Op.getOperand(0);
6997 SDValue Op1 = Op.getOperand(1);
6999 // Try to match the following pattern:
7000 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7001 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7002 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7003 Op0.getOperand(0) == Op1.getOperand(0) &&
7004 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7005 isa<ConstantSDNode>(Op1.getOperand(1)));
7009 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7010 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7012 if (i * 2 < NumElts) {
7014 V0 = Op0.getOperand(0);
7015 if (V0.getValueType() != VT)
7020 V1 = Op0.getOperand(0);
7021 if (V1.getValueType() != VT)
7024 if (i * 2 == NumElts)
7025 ExpectedVExtractIdx = BaseIdx;
7028 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7029 if (I0 == ExpectedVExtractIdx)
7030 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7031 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7032 // Try to match the following dag sequence:
7033 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7034 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7038 ExpectedVExtractIdx += 2;
7044 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7045 /// a concat_vector.
7047 /// This is a helper function of LowerToHorizontalOp().
7048 /// This function expects two 256-bit vectors called V0 and V1.
7049 /// At first, each vector is split into two separate 128-bit vectors.
7050 /// Then, the resulting 128-bit vectors are used to implement two
7051 /// horizontal binary operations.
7053 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7055 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
7056 /// the two new horizontal binops.
7057 /// When Mode is set, the first horizontal binop dag node would take as input
7058 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7059 /// horizontal binop dag node would take as input the lower 128-bit of V1
7060 /// and the upper 128-bit of V1.
7062 /// HADD V0_LO, V0_HI
7063 /// HADD V1_LO, V1_HI
7065 /// Otherwise, the first horizontal binop dag node takes as input the lower
7066 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7067 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7069 /// HADD V0_LO, V1_LO
7070 /// HADD V0_HI, V1_HI
7072 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7073 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7074 /// the upper 128-bits of the result.
7075 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7076 const SDLoc &DL, SelectionDAG &DAG,
7077 unsigned X86Opcode, bool Mode,
7078 bool isUndefLO, bool isUndefHI) {
7079 MVT VT = V0.getSimpleValueType();
7080 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7081 "Invalid nodes in input!");
7083 unsigned NumElts = VT.getVectorNumElements();
7084 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7085 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7086 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7087 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7088 MVT NewVT = V0_LO.getSimpleValueType();
7090 SDValue LO = DAG.getUNDEF(NewVT);
7091 SDValue HI = DAG.getUNDEF(NewVT);
7094 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7095 if (!isUndefLO && !V0->isUndef())
7096 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7097 if (!isUndefHI && !V1->isUndef())
7098 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7100 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7101 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7102 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7104 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7105 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7108 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7111 /// Returns true iff \p BV builds a vector with the result equivalent to
7112 /// the result of an ADDSUB operation.
7113 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7114 /// are written to the parameters \p Opnd0 and \p Opnd1.
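/// For example, the v4f32 build_vector
///   <(fsub A[0], B[0]), (fadd A[1], B[1]), (fsub A[2], B[2]), (fadd A[3], B[3])>
/// is equivalent to ADDSUB with Opnd0 = A and Opnd1 = B.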
7115 static bool isAddSub(const BuildVectorSDNode *BV,
7116 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7117 SDValue &Opnd0, SDValue &Opnd1) {
7119 MVT VT = BV->getSimpleValueType(0);
7120 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7121 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7122 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7125 unsigned NumElts = VT.getVectorNumElements();
7126 SDValue InVec0 = DAG.getUNDEF(VT);
7127 SDValue InVec1 = DAG.getUNDEF(VT);
7129 // Odd-numbered elements in the input build vector are obtained from
7130 // adding two integer/float elements.
7131 // Even-numbered elements in the input build vector are obtained from
7132 // subtracting two integer/float elements.
7133 unsigned ExpectedOpcode = ISD::FSUB;
7134 unsigned NextExpectedOpcode = ISD::FADD;
7135 bool AddFound = false;
7136 bool SubFound = false;
7138 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7139 SDValue Op = BV->getOperand(i);
7141 // Skip 'undef' values.
7142 unsigned Opcode = Op.getOpcode();
7143 if (Opcode == ISD::UNDEF) {
7144 std::swap(ExpectedOpcode, NextExpectedOpcode);
7148 // Early exit if we found an unexpected opcode.
7149 if (Opcode != ExpectedOpcode)
7152 SDValue Op0 = Op.getOperand(0);
7153 SDValue Op1 = Op.getOperand(1);
7155 // Try to match the following pattern:
7156 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7157 // Early exit if we cannot match that sequence.
7158 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7159 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7160 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7161 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7162 Op0.getOperand(1) != Op1.getOperand(1))
7165 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7169 // We found a valid add/sub node. Update the information accordingly.
7175 // Update InVec0 and InVec1.
7176 if (InVec0.isUndef()) {
7177 InVec0 = Op0.getOperand(0);
7178 if (InVec0.getSimpleValueType() != VT)
7181 if (InVec1.isUndef()) {
7182 InVec1 = Op1.getOperand(0);
7183 if (InVec1.getSimpleValueType() != VT)
7187 // Make sure that the input operands of each add/sub node always
7188 // come from the same pair of vectors.
7189 if (InVec0 != Op0.getOperand(0)) {
7190 if (ExpectedOpcode == ISD::FSUB)
7193 // FADD is commutable. Try to commute the operands
7194 // and then test again.
7195 std::swap(Op0, Op1);
7196 if (InVec0 != Op0.getOperand(0))
7200 if (InVec1 != Op1.getOperand(0))
7203 // Update the pair of expected opcodes.
7204 std::swap(ExpectedOpcode, NextExpectedOpcode);
7207 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7208 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7216 /// Returns true if it is possible to fold MUL and an idiom that has already been
7217 /// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7218 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7219 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7221 /// Prior to calling this function it should be known that there is some
7222 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7223 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7224 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7225 /// of \p Opnd0 uses is expected to be equal to 2.
7226 /// For example, this function may be called for the following IR:
7227 /// %AB = fmul fast <2 x double> %A, %B
7228 /// %Sub = fsub fast <2 x double> %AB, %C
7229 /// %Add = fadd fast <2 x double> %AB, %C
7230 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7231 /// <2 x i32> <i32 0, i32 3>
7232 /// There is a def for %Addsub here, which potentially can be replaced by
7233 /// X86ISD::ADDSUB operation:
7234 /// %Addsub = X86ISD::ADDSUB %AB, %C
7235 /// and such ADDSUB can further be replaced with FMADDSUB:
7236 /// %Addsub = FMADDSUB %A, %B, %C.
7238 /// The main reason why this method is called before the replacement of the
7239 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7240 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7242 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7243 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7244 if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7245 !Subtarget.hasAnyFMA())
7248 // FIXME: These checks must match the similar ones in
7249 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7250 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7251 // or MUL + ADDSUB to FMADDSUB.
7252 const TargetOptions &Options = DAG.getTarget().Options;
7254 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7259 Opnd1 = Opnd0.getOperand(1);
7260 Opnd0 = Opnd0.getOperand(0);
7265 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7266 /// into an X86ISD::ADDSUB or X86ISD::FMADDSUB node accordingly.
7267 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7268 const X86Subtarget &Subtarget,
7269 SelectionDAG &DAG) {
7270 SDValue Opnd0, Opnd1;
7271 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7274 MVT VT = BV->getSimpleValueType(0);
7277 // Try to generate X86ISD::FMADDSUB node here.
7279 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7280 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7282 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7283 // the ADDSUB idiom has been successfully recognized. There are no known
7284 // X86 targets with 512-bit ADDSUB instructions!
7285 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom recognition.
7287 if (VT.is512BitVector())
7290 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7293 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7294 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7295 const X86Subtarget &Subtarget,
7296 SelectionDAG &DAG) {
7297 MVT VT = BV->getSimpleValueType(0);
7298 unsigned NumElts = VT.getVectorNumElements();
7299 unsigned NumUndefsLO = 0;
7300 unsigned NumUndefsHI = 0;
7301 unsigned Half = NumElts/2;
7303 // Count the number of UNDEF operands in the input build_vector.
7304 for (unsigned i = 0, e = Half; i != e; ++i)
7305 if (BV->getOperand(i)->isUndef())
7308 for (unsigned i = Half, e = NumElts; i != e; ++i)
7309 if (BV->getOperand(i)->isUndef())
7312 // Early exit if this is either a build_vector of all UNDEFs or all the
7313 // operands but one are UNDEF.
7314 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7318 SDValue InVec0, InVec1;
7319 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7320 // Try to match an SSE3 float HADD/HSUB.
7321 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7322 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7324 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7325 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7326 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7327 // Try to match an SSSE3 integer HADD/HSUB.
7328 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7329 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7331 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7332 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7335 if (!Subtarget.hasAVX())
7338 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7339 // Try to match an AVX horizontal add/sub of packed single/double
7340 // precision floating point values from 256-bit vectors.
7341 SDValue InVec2, InVec3;
7342 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7343 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7344 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7345 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7346 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7348 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7349 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7350 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7351 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7352 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7353 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7354 // Try to match an AVX2 horizontal add/sub of signed integers.
7355 SDValue InVec2, InVec3;
7357 bool CanFold = true;
7359 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7360 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7361 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7362 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7363 X86Opcode = X86ISD::HADD;
7364 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7365 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7366 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7367 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7368 X86Opcode = X86ISD::HSUB;
7373 // Fold this build_vector into a single horizontal add/sub.
7374 // Do this only if the target has AVX2.
7375 if (Subtarget.hasAVX2())
7376 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7378 // Do not try to expand this build_vector into a pair of horizontal
7379 // add/sub if we can emit a pair of scalar add/sub.
7380 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7383 // Convert this build_vector into a pair of horizontal binops followed by a concat vector.
7385 bool isUndefLO = NumUndefsLO == Half;
7386 bool isUndefHI = NumUndefsHI == Half;
7387 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7388 isUndefLO, isUndefHI);
7392 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7393 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7395 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7396 X86Opcode = X86ISD::HADD;
7397 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7398 X86Opcode = X86ISD::HSUB;
7399 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7400 X86Opcode = X86ISD::FHADD;
7401 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7402 X86Opcode = X86ISD::FHSUB;
7406 // Don't try to expand this build_vector into a pair of horizontal add/sub
7407 // if we can simply emit a pair of scalar add/sub.
7408 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7411 // Convert this build_vector into two horizontal add/sub followed by a concat vector.
7413 bool isUndefLO = NumUndefsLO == Half;
7414 bool isUndefHI = NumUndefsHI == Half;
7415 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7416 isUndefLO, isUndefHI);
7422 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7423 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7424 /// just apply the bit to the vectors.
7425 /// NOTE: It's not in our interest to start making a general purpose vectorizer
7426 /// from this, but enough scalar bit operations are created from the later
7427 /// legalization + scalarization stages to need basic support.
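/// For example, (build_vector (and x, 1), (and y, 2), (and z, 4), (and w, 8))
/// becomes (and (build_vector x, y, z, w), (build_vector 1, 2, 4, 8)).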
7428 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7429 SelectionDAG &DAG) {
7431 MVT VT = Op->getSimpleValueType(0);
7432 unsigned NumElems = VT.getVectorNumElements();
7433 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7435 // Check that all elements have the same opcode.
7436 // TODO: Should we allow UNDEFS and if so how many?
7437 unsigned Opcode = Op->getOperand(0).getOpcode();
7438 for (unsigned i = 1; i < NumElems; ++i)
7439 if (Opcode != Op->getOperand(i).getOpcode())
7442 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7449 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7454 SmallVector<SDValue, 4> LHSElts, RHSElts;
7455 for (SDValue Elt : Op->ops()) {
7456 SDValue LHS = Elt.getOperand(0);
7457 SDValue RHS = Elt.getOperand(1);
7459 // We expect the canonicalized RHS operand to be the constant.
7460 if (!isa<ConstantSDNode>(RHS))
7462 LHSElts.push_back(LHS);
7463 RHSElts.push_back(RHS);
7466 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7467 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7468 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7471 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7472 /// functionality to do this, so it's all zeros, all ones, or some derivation
7473 /// that is cheap to calculate.
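/// For example, an all-ones v4i32 can be materialized with a single
/// pcmpeqd of a register against itself, avoiding a constant-pool load.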
7474 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7475 const X86Subtarget &Subtarget) {
7477 MVT VT = Op.getSimpleValueType();
7479 // Vectors containing all zeros can be matched by pxor and xorps.
7480 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7481 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7482 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7483 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7486 return getZeroVector(VT, Subtarget, DAG, DL);
7489 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7490 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7491 // vpcmpeqd on 256-bit vectors.
7492 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7493 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7494 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7497 return getOnesVector(VT, DAG, DL);
7504 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7507 MVT VT = Op.getSimpleValueType();
7508 MVT ExtVT = VT.getVectorElementType();
7509 unsigned NumElems = Op.getNumOperands();
7511 // Generate vectors for predicate vectors.
7512 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7513 return LowerBUILD_VECTORvXi1(Op, DAG);
7515 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7516 return VectorConstant;
7518 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7519 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7521 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7522 return HorizontalOp;
7523 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7525 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7528 unsigned EVTBits = ExtVT.getSizeInBits();
7530 unsigned NumZero = 0;
7531 unsigned NumNonZero = 0;
7532 uint64_t NonZeros = 0;
7533 bool IsAllConstants = true;
7534 SmallSet<SDValue, 8> Values;
7535 for (unsigned i = 0; i < NumElems; ++i) {
7536 SDValue Elt = Op.getOperand(i);
7540 if (Elt.getOpcode() != ISD::Constant &&
7541 Elt.getOpcode() != ISD::ConstantFP)
7542 IsAllConstants = false;
7543 if (X86::isZeroNode(Elt))
7546 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7547 NonZeros |= ((uint64_t)1 << i);
7552 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7553 if (NumNonZero == 0)
7554 return DAG.getUNDEF(VT);
7556 // Special case for a single non-zero, non-undef element.
7557 if (NumNonZero == 1) {
7558 unsigned Idx = countTrailingZeros(NonZeros);
7559 SDValue Item = Op.getOperand(Idx);
7561 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7562 // the value are obviously zero, truncate the value to i32 and do the
7563 // insertion that way. Only do this if the value is non-constant or if the
7564 // value is a constant being inserted into element 0. It is cheaper to do
7565 // a constant pool load than it is to do a movd + shuffle.
7566 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7567 (!IsAllConstants || Idx == 0)) {
7568 if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7570 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7571 MVT VecVT = MVT::v4i32;
7573 // Truncate the value (which may itself be a constant) to i32, and
7574 // convert it to a vector with movd (S2V+shuffle to zero extend).
7575 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7576 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7577 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7578 Item, Idx * 2, true, Subtarget, DAG));
7582 // If we have a constant or non-constant insertion into the low element of
7583 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7584 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7585 // depending on what the source datatype is.
7588 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7590 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7591 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7592 assert((VT.is128BitVector() || VT.is256BitVector() ||
7593 VT.is512BitVector()) &&
7594 "Expected an SSE value type!");
7595 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7596 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7597 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7600 // We can't directly insert an i8 or i16 into a vector, so zero extend it to i32 first.
7602 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7603 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7604 if (VT.getSizeInBits() >= 256) {
7605 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7606 if (Subtarget.hasAVX()) {
7607 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7608 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7610 // Without AVX, we need to extend to a 128-bit vector and then
7611 // insert into the 256-bit vector.
7612 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7613 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7614 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7617 assert(VT.is128BitVector() && "Expected an SSE value type!");
7618 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7619 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7621 return DAG.getBitcast(VT, Item);
7625 // Is it a vector logical left shift?
7626 if (NumElems == 2 && Idx == 1 &&
7627 X86::isZeroNode(Op.getOperand(0)) &&
7628 !X86::isZeroNode(Op.getOperand(1))) {
7629 unsigned NumBits = VT.getSizeInBits();
7630 return getVShift(true, VT,
7631 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7632 VT, Op.getOperand(1)),
7633 NumBits/2, DAG, *this, dl);
7636 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7639 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7640 // is a non-constant being inserted into an element other than the low one,
7641 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
7642 // movd/movss) to move this into the low element, then shuffle it into place.
7644 if (EVTBits == 32) {
7645 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7646 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7650 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7651 if (Values.size() == 1) {
7652 if (EVTBits == 32) {
7653 // Instead of a shuffle like this:
7654 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7655 // Check if it's possible to issue this instead.
7656 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7657 unsigned Idx = countTrailingZeros(NonZeros);
7658 SDValue Item = Op.getOperand(Idx);
7659 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7660 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7665 // A vector full of immediates; various special cases are already
7666 // handled, so this is best done with a single constant-pool load.
7670 // See if we can use a vector load to get all of the elements.
7671 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7672 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7673 if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
7677 // For AVX-length vectors, build the individual 128-bit pieces and use
7678 // shuffles to put them in place.
7679 if (VT.is256BitVector() || VT.is512BitVector()) {
7680 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7682 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7684 // Build both the lower and upper subvector.
7686 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7687 SDValue Upper = DAG.getBuildVector(
7688 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7690 // Recreate the wider vector with the lower and upper part.
7691 if (VT.is256BitVector())
7692 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7693 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7696 // Let legalizer expand 2-wide build_vectors.
7697 if (EVTBits == 64) {
7698 if (NumNonZero == 1) {
7699 // One half is zero or undef.
7700 unsigned Idx = countTrailingZeros(NonZeros);
7701 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7702 Op.getOperand(Idx));
7703 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7708 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7709 if (EVTBits == 8 && NumElems == 16)
7710 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7714 if (EVTBits == 16 && NumElems == 8)
7715 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7719 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7720 if (EVTBits == 32 && NumElems == 4)
7721 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
7724 // If element VT is == 32 bits, turn it into a number of shuffles.
7725 if (NumElems == 4 && NumZero > 0) {
7726 SmallVector<SDValue, 8> Ops(NumElems);
7727 for (unsigned i = 0; i < 4; ++i) {
7728 bool isZero = !(NonZeros & (1ULL << i));
7730 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7732 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7735 for (unsigned i = 0; i < 2; ++i) {
7736 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7739 Ops[i] = Ops[i*2]; // Must be a zero vector.
7742 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7745 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7748 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7753 bool Reverse1 = (NonZeros & 0x3) == 2;
7754 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7758 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7759 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7761 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7764 if (Values.size() > 1 && VT.is128BitVector()) {
7765 // Check for a build vector from mostly shuffle plus few inserting.
7766 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
7769 // For SSE 4.1, use insertps to put the high elements into the low element.
7770 if (Subtarget.hasSSE41()) {
7772 if (!Op.getOperand(0).isUndef())
7773 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7775 Result = DAG.getUNDEF(VT);
7777 for (unsigned i = 1; i < NumElems; ++i) {
7778 if (Op.getOperand(i).isUndef()) continue;
7779 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7780 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
7785 // Otherwise, expand into a number of unpckl*; start by extending each of
7786 // our (non-undef) elements to the full vector width with the element in the
7787 // bottom slot of the vector (which generates no code for SSE).
7788 SmallVector<SDValue, 8> Ops(NumElems);
7789 for (unsigned i = 0; i < NumElems; ++i) {
7790 if (!Op.getOperand(i).isUndef())
7791 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7793 Ops[i] = DAG.getUNDEF(VT);
7796 // Next, we iteratively mix elements, e.g. for v4f32:
7797 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7798 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7799 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
7800 unsigned EltStride = NumElems >> 1;
7801 while (EltStride != 0) {
7802 for (unsigned i = 0; i < EltStride; ++i) {
7803 // If Ops[i+EltStride] is undef and this is the first round of mixing,
7804 // then it is safe to just drop this shuffle: V[i] is already in the
7805 // right place, the one element (since it's the first round) being
7806 // inserted as undef can be dropped. This isn't safe for successive
7807 // rounds because they will permute elements within both vectors.
7808 if (Ops[i+EltStride].isUndef() &&
7809 EltStride == NumElems/2)
7812 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
7821 // 256-bit AVX can use the vinsertf128 instruction
7822 // to create 256-bit vectors from two other 128-bit ones.
7823 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7825 MVT ResVT = Op.getSimpleValueType();
7827 assert((ResVT.is256BitVector() ||
7828 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7830 SDValue V1 = Op.getOperand(0);
7831 SDValue V2 = Op.getOperand(1);
7832 unsigned NumElems = ResVT.getVectorNumElements();
7833 if (ResVT.is256BitVector())
7834 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7836 if (Op.getNumOperands() == 4) {
7837 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7838 ResVT.getVectorNumElements()/2);
7839 SDValue V3 = Op.getOperand(2);
7840 SDValue V4 = Op.getOperand(3);
7841 return concat256BitVectors(
7842 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
7843 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
7846 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7849 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
7850 const X86Subtarget &Subtarget,
7851 SelectionDAG & DAG) {
7853 MVT ResVT = Op.getSimpleValueType();
7854 unsigned NumOfOperands = Op.getNumOperands();
7856 assert(isPowerOf2_32(NumOfOperands) &&
7857 "Unexpected number of operands in CONCAT_VECTORS");
7859 SDValue Undef = DAG.getUNDEF(ResVT);
7860 if (NumOfOperands > 2) {
7861 // Specialize the cases when all, or all but one, of the operands are undef.
7862 unsigned NumOfDefinedOps = 0;
7864 for (unsigned i = 0; i < NumOfOperands; i++)
7865 if (!Op.getOperand(i).isUndef()) {
7869 if (NumOfDefinedOps == 0)
7871 if (NumOfDefinedOps == 1) {
7872 unsigned SubVecNumElts =
7873 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
7874 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
7875 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
7876 Op.getOperand(OpIdx), IdxVal);
7879 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7880 ResVT.getVectorNumElements()/2);
7881 SmallVector<SDValue, 2> Ops;
7882 for (unsigned i = 0; i < NumOfOperands/2; i++)
7883 Ops.push_back(Op.getOperand(i));
7884 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7886 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
7887 Ops.push_back(Op.getOperand(i));
7888 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7889 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
7893 SDValue V1 = Op.getOperand(0);
7894 SDValue V2 = Op.getOperand(1);
7895 unsigned NumElems = ResVT.getVectorNumElements();
7896 assert(V1.getValueType() == V2.getValueType() &&
7897 V1.getValueType().getVectorNumElements() == NumElems/2 &&
7898 "Unexpected operands in CONCAT_VECTORS");
7900 if (ResVT.getSizeInBits() >= 16)
7901 return Op; // The operation is legal with KUNPCK
7903 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
7904 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
7905 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
7906 if (IsZeroV1 && IsZeroV2)
7909 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
7911 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7913 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
7915 SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
7917 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
7920 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
7922 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7923 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
7926 static SDValue LowerCONCAT_VECTORS(SDValue Op,
7927 const X86Subtarget &Subtarget,
7928 SelectionDAG &DAG) {
7929 MVT VT = Op.getSimpleValueType();
7930 if (VT.getVectorElementType() == MVT::i1)
7931 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
7933 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7934 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7935 Op.getNumOperands() == 4)));
7937 // AVX can use the vinsertf128 instruction to create 256-bit vectors
7938 // from two other 128-bit ones.
7940 // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
7941 return LowerAVXCONCAT_VECTORS(Op, DAG);
7944 //===----------------------------------------------------------------------===//
7945 // Vector shuffle lowering
7947 // This is an experimental code path for lowering vector shuffles on x86. It is
7948 // designed to handle arbitrary vector shuffles and blends, gracefully
7949 // degrading performance as necessary. It works hard to recognize idiomatic
7950 // shuffles and lower them to optimal instruction patterns without leaving
7951 // a framework that allows reasonably efficient handling of all vector shuffle patterns.
7953 //===----------------------------------------------------------------------===//
7955 /// \brief Tiny helper function to identify a no-op mask.
7957 /// This is a somewhat boring predicate function. It checks whether the mask
7958 /// array input, which is assumed to be a single-input shuffle mask of the kind
7959 /// used by the X86 shuffle instructions (not a fully general
7960 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
7961 /// in-place shuffle are 'no-op's.
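/// For illustration: with a 4-element mask, { -1, 1, 2, -1 } is a no-op
/// (every defined entry already sits in its own slot), while { 1, 0, 2, 3 }
/// is not, because elements 0 and 1 would swap.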
7962 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7963 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7964 assert(Mask[i] >= -1 && "Out of bound mask element!");
7965 if (Mask[i] >= 0 && Mask[i] != i)
7971 /// \brief Test whether there are elements crossing 128-bit lanes in this shuffle mask.
7974 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7975 /// and we routinely test for these.
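/// For example, with MVT::v8f32 (four 32-bit elements per 128-bit lane) the
/// mask { 3, 2, 1, 0, 7, 6, 5, 4 } stays within each lane, while
/// { 4, 1, 2, 3, 0, 5, 6, 7 } crosses lanes because element 0 is sourced from
/// the upper lane (and element 4 from the lower one).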
7976 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7977 int LaneSize = 128 / VT.getScalarSizeInBits();
7978 int Size = Mask.size();
7979 for (int i = 0; i < Size; ++i)
7980 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7985 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
7987 /// This checks a shuffle mask to see if it is performing the same
7988 /// lane-relative shuffle in each sub-lane. This trivially implies
7989 /// that it is also not lane-crossing. It may however involve a blend from the
7990 /// same lane of a second vector.
7992 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7993 /// non-trivial to compute in the face of undef lanes. The representation is
7994 /// suitable for use with existing 128-bit shuffles as entries from the second
7995 /// vector have been remapped to [LaneSize, 2*LaneSize).
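/// For example, for MVT::v8i32 with 128-bit lanes the mask
/// { 0, 9, 2, 11, 4, 13, 6, 15 } performs the same lane-relative shuffle in
/// both lanes; the populated RepeatedMask is { 0, 5, 2, 7 }, where 5 and 7 are
/// second-vector entries remapped into [LaneSize, 2*LaneSize).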
7996 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
7998 SmallVectorImpl<int> &RepeatedMask) {
7999 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8000 RepeatedMask.assign(LaneSize, -1);
8001 int Size = Mask.size();
8002 for (int i = 0; i < Size; ++i) {
8003 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8006 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8007 // This entry crosses lanes, so there is no way to model this shuffle.
8010 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8011 // Adjust second vector indices to start at LaneSize instead of Size.
8012 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8013 : Mask[i] % LaneSize + LaneSize;
8014 if (RepeatedMask[i % LaneSize] < 0)
8015 // This is the first non-undef entry in this slot of a 128-bit lane.
8016 RepeatedMask[i % LaneSize] = LocalM;
8017 else if (RepeatedMask[i % LaneSize] != LocalM)
8018 // Found a mismatch with the repeated mask.
8024 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8026 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8027 SmallVectorImpl<int> &RepeatedMask) {
8028 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8031 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8033 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8034 SmallVectorImpl<int> &RepeatedMask) {
8035 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8038 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8039 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
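/// For example, the v8i32 target mask
/// { 0, SM_SentinelZero, 2, 3, 4, SM_SentinelZero, 6, 7 } repeats within each
/// 128-bit lane and yields the RepeatedMask { 0, SM_SentinelZero, 2, 3 }.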
8040 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8042 SmallVectorImpl<int> &RepeatedMask) {
8043 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8044 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8045 int Size = Mask.size();
8046 for (int i = 0; i < Size; ++i) {
8047 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8048 if (Mask[i] == SM_SentinelUndef)
8050 if (Mask[i] == SM_SentinelZero) {
8051 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8053 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8056 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8057 // This entry crosses lanes, so there is no way to model this shuffle.
8060 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8061 // Adjust second vector indices to start at LaneSize instead of Size.
8063 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8064 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8065 // This is the first non-undef entry in this slot of a 128-bit lane.
8066 RepeatedMask[i % LaneSize] = LocalM;
8067 else if (RepeatedMask[i % LaneSize] != LocalM)
8068 // Found a mismatch with the repeated mask.
8074 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of arguments.
8077 /// This is a fast way to test a shuffle mask against a fixed pattern:
8079 /// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
8081 /// It returns true if the mask is exactly as wide as the argument list, and
8082 /// each element of the mask is either -1 (signifying undef) or the value given
8083 /// in the argument.
8084 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8085 ArrayRef<int> ExpectedMask) {
8086 if (Mask.size() != ExpectedMask.size())
8089 int Size = Mask.size();
8091 // If the values are build vectors, we can look through them to find
8092 // equivalent inputs that make the shuffles equivalent.
8093 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8094 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8096 for (int i = 0; i < Size; ++i) {
8097 assert(Mask[i] >= -1 && "Out of bound mask element!");
8098 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8099 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8100 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8101 if (!MaskBV || !ExpectedBV ||
8102 MaskBV->getOperand(Mask[i] % Size) !=
8103 ExpectedBV->getOperand(ExpectedMask[i] % Size))
8111 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8113 /// The masks must be exactly the same width.
8115 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8116 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8118 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
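/// For example, the mask { 0, SM_SentinelUndef, SM_SentinelZero, 7 } is
/// equivalent to the expected mask { 0, 5, SM_SentinelZero, 7 } (the undef
/// entry matches anything), but not to { 0, 5, 2, 7 }, since a zeroed entry
/// only matches an explicit SM_SentinelZero.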
8119 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8120 ArrayRef<int> ExpectedMask) {
8121 int Size = Mask.size();
8122 if (Size != (int)ExpectedMask.size())
8125 for (int i = 0; i < Size; ++i)
8126 if (Mask[i] == SM_SentinelUndef)
8128 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8130 else if (Mask[i] != ExpectedMask[i])
8136 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle mask.
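// For example, Mask { 0, 5, 2, 7 } with Zeroable bits { 0, 1, 0, 1 } (elements
// 1 and 3 known to be zero) produces the target mask
// { 0, SM_SentinelZero, 2, SM_SentinelZero }.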
8138 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8139 const APInt &Zeroable) {
8140 int NumElts = Mask.size();
8141 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8143 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8144 for (int i = 0; i != NumElts; ++i) {
8146 if (M == SM_SentinelUndef)
8148 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8149 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8154 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd instructions.
8156 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8157 if (VT != MVT::v8i32 && VT != MVT::v8f32)
8160 SmallVector<int, 8> Unpcklwd;
8161 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8162 /* Unary = */ false);
8163 SmallVector<int, 8> Unpckhwd;
8164 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8165 /* Unary = */ false);
8166 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8167 isTargetShuffleEquivalent(Mask, Unpckhwd));
8168 return IsUnpackwdMask;
8171 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8173 /// This helper function produces an 8-bit shuffle immediate corresponding to
8174 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
8175 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for example.
8178 /// NB: We rely heavily on "undef" masks preserving the input lane.
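/// For example, the mask { 3, 1, 2, 0 } encodes as 3<<0 | 1<<2 | 2<<4 | 0<<6
/// == 0x27, and the partially undef mask { -1, -1, 3, 2 } defaults the undef
/// slots to their own lane, giving 0<<0 | 1<<2 | 3<<4 | 2<<6 == 0xB4.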
8179 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8180 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8181 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8182 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8183 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8184 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8187 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8188 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8189 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8190 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8194 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8195 SelectionDAG &DAG) {
8196 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8199 /// \brief Compute whether each element of a shuffle is zeroable.
8201 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8202 /// Either it is an undef element in the shuffle mask, the element of the input
8203 /// referenced is undef, or the element of the input referenced is known to be
8204 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8205 /// as many lanes with this technique as possible to simplify the remaining shuffle.
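/// For example, with Mask { 0, 5, 2, 7 } and V2 a build vector of all zeros,
/// elements 1 and 3 are zeroable, so the returned APInt has bits 1 and 3 set.
/// Undef mask entries are always considered zeroable.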
8207 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8208 SDValue V1, SDValue V2) {
8209 APInt Zeroable(Mask.size(), 0);
8210 V1 = peekThroughBitcasts(V1);
8211 V2 = peekThroughBitcasts(V2);
8213 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8214 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8216 int VectorSizeInBits = V1.getValueSizeInBits();
8217 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8218 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8220 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8222 // Handle the easy cases.
8223 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8228 // Determine shuffle input and normalize the mask.
8229 SDValue V = M < Size ? V1 : V2;
8232 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8233 if (V.getOpcode() != ISD::BUILD_VECTOR)
8236 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8237 // the (larger) source element must be UNDEF/ZERO.
8238 if ((Size % V.getNumOperands()) == 0) {
8239 int Scale = Size / V->getNumOperands();
8240 SDValue Op = V.getOperand(M / Scale);
8241 if (Op.isUndef() || X86::isZeroNode(Op))
8243 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8244 APInt Val = Cst->getAPIntValue();
8245 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8246 Val = Val.getLoBits(ScalarSizeInBits);
8249 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8250 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8251 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8252 Val = Val.getLoBits(ScalarSizeInBits);
8259 // If the BUILD_VECTOR has more elements, then all the (smaller) source
8260 // elements must be UNDEF or ZERO.
8261 if ((V.getNumOperands() % Size) == 0) {
8262 int Scale = V->getNumOperands() / Size;
8263 bool AllZeroable = true;
8264 for (int j = 0; j < Scale; ++j) {
8265 SDValue Op = V.getOperand((M * Scale) + j);
8266 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8277 // The shuffle result is as follows:
8278 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
8279 // Each element of Zeroable corresponds to a particular element of Mask, as
8280 // described in the computeZeroableShuffleElements function.
8282 // The function looks for a sub-mask whose non-zero elements are in
8283 // increasing order. If such a sub-mask exists, the function returns true.
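// For example, with an 8-element type and all mask entries defined, a mask of
// { 0, z, 1, z, 2, z, 3, z } (where 'z' marks a zeroable slot) has its
// non-zero elements 0, 1, 2, 3 in increasing order, so the function returns
// true; { 0, z, 2, z, 1, z, 3, z } does not, because 2 is followed by 1.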
8284 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8285 ArrayRef<int> Mask, const EVT &VectorType,
8286 bool &IsZeroSideLeft) {
8287 int NextElement = -1;
8288 // Check if the Mask's nonzero elements are in increasing order.
8289 for (int i = 0, e = Mask.size(); i < e; i++) {
8290 // Check that the mask's zero elements are built from only zeros.
8291 assert(Mask[i] >= -1 && "Out of bound mask element!");
8296 // Find the lowest non-zero element.
8297 if (NextElement < 0) {
8298 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8299 IsZeroSideLeft = NextElement != 0;
8301 // Exit if the mask's non-zero elements are not in increasing order.
8302 if (NextElement != Mask[i])
8309 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
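/// For example, a single-input v4i32 shuffle with mask { 2, 2, 0, 0 } and no
/// zeroable elements expands to the byte mask
/// { 8,9,10,11, 8,9,10,11, 0,1,2,3, 0,1,2,3 } fed to PSHUFB.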
8310 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8311 ArrayRef<int> Mask, SDValue V1,
8313 const APInt &Zeroable,
8314 const X86Subtarget &Subtarget,
8315 SelectionDAG &DAG) {
8316 int Size = Mask.size();
8317 int LaneSize = 128 / VT.getScalarSizeInBits();
8318 const int NumBytes = VT.getSizeInBits() / 8;
8319 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8321 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8322 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8323 (Subtarget.hasBWI() && VT.is512BitVector()));
8325 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8326 // Sign bit set in i8 mask means zero element.
8327 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8330 for (int i = 0; i < NumBytes; ++i) {
8331 int M = Mask[i / NumEltBytes];
8333 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8336 if (Zeroable[i / NumEltBytes]) {
8337 PSHUFBMask[i] = ZeroMask;
8341 // We can only use a single input of V1 or V2.
8342 SDValue SrcV = (M >= Size ? V2 : V1);
8348 // PSHUFB can't cross lanes; ensure this doesn't happen.
8349 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8353 M = M * NumEltBytes + (i % NumEltBytes);
8354 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8356 assert(V && "Failed to find a source input");
8358 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8359 return DAG.getBitcast(
8360 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8361 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8364 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8365 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8368 // X86 has a dedicated shuffle form that can be lowered to VEXPAND.
8369 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8370 const APInt &Zeroable,
8371 ArrayRef<int> Mask, SDValue &V1,
8372 SDValue &V2, SelectionDAG &DAG,
8373 const X86Subtarget &Subtarget) {
8374 bool IsLeftZeroSide = true;
8375 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8378 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8380 MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8381 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8382 unsigned NumElts = VT.getVectorNumElements();
8383 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8384 "Unexpected number of vector elements");
8385 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8386 Subtarget, DAG, DL);
8387 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8388 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8389 return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
8390 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8394 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8395 unsigned &UnpackOpcode, bool IsUnary,
8396 ArrayRef<int> TargetMask, SDLoc &DL,
8398 const X86Subtarget &Subtarget) {
8399 int NumElts = VT.getVectorNumElements();
8401 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8402 for (int i = 0; i != NumElts; i += 2) {
8403 int M1 = TargetMask[i + 0];
8404 int M2 = TargetMask[i + 1];
8405 Undef1 &= (SM_SentinelUndef == M1);
8406 Undef2 &= (SM_SentinelUndef == M2);
8407 Zero1 &= isUndefOrZero(M1);
8408 Zero2 &= isUndefOrZero(M2);
8410 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8411 "Zeroable shuffle detected");
8413 // Attempt to match the target mask against the unpack lo/hi mask patterns.
8414 SmallVector<int, 64> Unpckl, Unpckh;
8415 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8416 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8417 UnpackOpcode = X86ISD::UNPCKL;
8418 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8419 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8423 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8424 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8425 UnpackOpcode = X86ISD::UNPCKH;
8426 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8427 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8431 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
8432 if (IsUnary && (Zero1 || Zero2)) {
8433 // Don't bother if we can blend instead.
8434 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8435 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8438 bool MatchLo = true, MatchHi = true;
8439 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8440 int M = TargetMask[i];
8442 // Ignore if the input is known to be zero or the index is undef.
8443 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8444 (M == SM_SentinelUndef))
8447 MatchLo &= (M == Unpckl[i]);
8448 MatchHi &= (M == Unpckh[i]);
8451 if (MatchLo || MatchHi) {
8452 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8453 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8454 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8459 // If a binary shuffle, commute and try again.
8461 ShuffleVectorSDNode::commuteMask(Unpckl);
8462 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8463 UnpackOpcode = X86ISD::UNPCKL;
8468 ShuffleVectorSDNode::commuteMask(Unpckh);
8469 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8470 UnpackOpcode = X86ISD::UNPCKH;
8479 // X86 has dedicated unpack instructions that can handle specific blend
8480 // operations: UNPCKH and UNPCKL.
8481 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8482 ArrayRef<int> Mask, SDValue V1,
8483 SDValue V2, SelectionDAG &DAG) {
8484 SmallVector<int, 8> Unpckl;
8485 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8486 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8487 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8489 SmallVector<int, 8> Unpckh;
8490 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8491 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8492 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8494 // Commute and try again.
8495 ShuffleVectorSDNode::commuteMask(Unpckl);
8496 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8497 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8499 ShuffleVectorSDNode::commuteMask(Unpckh);
8500 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8501 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8506 /// \brief Try to emit a bitmask instruction for a shuffle.
8508 /// This handles cases where we can model a blend exactly as a bitmask due to
8509 /// one of the inputs being zeroable.
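/// For example, a v4i32 shuffle with mask { z, 1, z, 3 } (elements 0 and 2
/// zeroable) becomes V1 & { 0, -1, 0, -1 }.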
8510 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8511 SDValue V2, ArrayRef<int> Mask,
8512 const APInt &Zeroable,
8513 SelectionDAG &DAG) {
8514 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8515 MVT EltVT = VT.getVectorElementType();
8516 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8517 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8518 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
8520 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8523 if (Mask[i] % Size != i)
8524 return SDValue(); // Not a blend.
8526 V = Mask[i] < Size ? V1 : V2;
8527 else if (V != (Mask[i] < Size ? V1 : V2))
8528 return SDValue(); // Can only let one input through the mask.
8530 VMaskOps[i] = AllOnes;
8533 return SDValue(); // No non-zeroable elements!
8535 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8536 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
8539 /// \brief Try to emit a blend instruction for a shuffle using bit math.
8541 /// This is used as a fallback approach when first class blend instructions are
8542 /// unavailable. Currently it is only suitable for integer vectors, but could
8543 /// be generalized for floating point vectors if desirable.
8544 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8545 SDValue V2, ArrayRef<int> Mask,
8546 SelectionDAG &DAG) {
8547 assert(VT.isInteger() && "Only supports integer vector types!");
8548 MVT EltVT = VT.getVectorElementType();
8549 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8550 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8551 SmallVector<SDValue, 16> MaskOps;
8552 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8553 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8554 return SDValue(); // Shuffled input!
8555 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8558 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8559 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8560 // We have to cast V2 around.
8561 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8562 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8563 DAG.getBitcast(MaskVT, V1Mask),
8564 DAG.getBitcast(MaskVT, V2)));
8565 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
8568 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
8569 SDValue PreservedSrc,
8570 const X86Subtarget &Subtarget,
8573 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
8574 MutableArrayRef<int> TargetMask,
8575 bool &ForceV1Zero, bool &ForceV2Zero,
8576 uint64_t &BlendMask) {
8577 bool V1IsZeroOrUndef =
8578 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
8579 bool V2IsZeroOrUndef =
8580 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
8583 ForceV1Zero = false, ForceV2Zero = false;
8584 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
8586 // Attempt to generate the binary blend mask. If an input is zero then
8587 // we can use any lane.
8588 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8589 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
8590 int M = TargetMask[i];
8591 if (M == SM_SentinelUndef)
8595 if (M == i + Size) {
8596 BlendMask |= 1ull << i;
8599 if (M == SM_SentinelZero) {
8600 if (V1IsZeroOrUndef) {
8605 if (V2IsZeroOrUndef) {
8607 BlendMask |= 1ull << i;
8608 TargetMask[i] = i + Size;
8617 uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
8618 uint64_t ScaledMask = 0;
8619 for (int i = 0; i != Size; ++i)
8620 if (BlendMask & (1ull << i))
8621 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
8625 /// \brief Try to emit a blend instruction for a shuffle.
8627 /// This doesn't do any checks for the availability of instructions for blending
8628 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8629 /// be matched in the backend with the type given. What it does check for is
8630 /// that the shuffle mask is a blend, or convertible into a blend with zero.
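/// For example, the v4i32 mask { 0, 5, 2, 7 } takes elements 1 and 3 from V2
/// and the rest from V1, so the computed BlendMask is 0b1010.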
8631 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8632 SDValue V2, ArrayRef<int> Original,
8633 const APInt &Zeroable,
8634 const X86Subtarget &Subtarget,
8635 SelectionDAG &DAG) {
8636 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
8638 uint64_t BlendMask = 0;
8639 bool ForceV1Zero = false, ForceV2Zero = false;
8640 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
8644 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
8646 V1 = getZeroVector(VT, Subtarget, DAG, DL);
8648 V2 = getZeroVector(VT, Subtarget, DAG, DL);
8650 switch (VT.SimpleTy) {
8655 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8656 DAG.getConstant(BlendMask, DL, MVT::i8));
8660 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8664 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8665 // that instruction.
8666 if (Subtarget.hasAVX2()) {
8667 // Scale the blend by the number of 32-bit dwords per element.
8668 int Scale = VT.getScalarSizeInBits() / 32;
8669 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
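// E.g. for v4i64 each element covers two dwords (Scale == 2), so a qword
// blend mask of 0b0101 becomes the dword blend mask 0b00110011.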
8670 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8671 V1 = DAG.getBitcast(BlendVT, V1);
8672 V2 = DAG.getBitcast(BlendVT, V2);
8673 return DAG.getBitcast(
8674 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8675 DAG.getConstant(BlendMask, DL, MVT::i8)));
8679 // For integer shuffles we need to expand the mask and cast the inputs to
8680 // v8i16s prior to blending.
8681 int Scale = 8 / VT.getVectorNumElements();
8682 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8683 V1 = DAG.getBitcast(MVT::v8i16, V1);
8684 V2 = DAG.getBitcast(MVT::v8i16, V2);
8685 return DAG.getBitcast(VT,
8686 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8687 DAG.getConstant(BlendMask, DL, MVT::i8)));
8691 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8692 SmallVector<int, 8> RepeatedMask;
8693 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8694 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8695 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
8697 for (int i = 0; i < 8; ++i)
8698 if (RepeatedMask[i] >= 8)
8699 BlendMask |= 1ull << i;
8700 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8701 DAG.getConstant(BlendMask, DL, MVT::i8));
8707 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8708 "256-bit byte-blends require AVX2 support!");
8710 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
8712 MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8713 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8714 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8717 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8718 if (SDValue Masked =
8719 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8722 // Scale the blend by the number of bytes per element.
8723 int Scale = VT.getScalarSizeInBits() / 8;
8725 // This form of blend is always done on bytes. Compute the byte vector type.
8727 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8729 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8730 // mix of LLVM's code generator and the x86 backend. We tell the code
8731 // generator that boolean values in the elements of an x86 vector register
8732 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8733 // mapping a select to operand #1, and 'false' mapping to operand #2. The
8734 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8735 // of the element (the remaining are ignored) and 0 in that high bit would
8736 // mean operand #1 while 1 in the high bit would mean operand #2. So while
8737 // the LLVM model for boolean values in vector elements gets the relevant
8738 // bit set, it is set backwards and over-constrained relative to x86's actual model.
8740 SmallVector<SDValue, 32> VSELECTMask;
8741 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8742 for (int j = 0; j < Scale; ++j)
8743 VSELECTMask.push_back(
8744 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8745 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8748 V1 = DAG.getBitcast(BlendVT, V1);
8749 V2 = DAG.getBitcast(BlendVT, V2);
8750 return DAG.getBitcast(
8751 VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
8752 DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
8761 MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8762 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8763 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8766 llvm_unreachable("Not a supported integer vector type!");
8770 /// \brief Try to lower as a blend of elements from two inputs followed by
8771 /// a single-input permutation.
8773 /// This matches the pattern where we can blend elements from two inputs and
8774 /// then reduce the shuffle to a single-input permutation.
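/// For example, the v4i32 mask { 5, 0, 7, 2 } is first blended with
/// BlendMask { 0, 5, 2, 7 } (taking elements 1 and 3 from V2) and then
/// permuted with PermuteMask { 1, 0, 3, 2 }.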
8775 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8776 SDValue V1, SDValue V2,
8778 SelectionDAG &DAG) {
8779 // We build up the blend mask while checking whether a blend is a viable way
8780 // to reduce the shuffle.
8781 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8782 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
8784 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8788 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
8790 if (BlendMask[Mask[i] % Size] < 0)
8791 BlendMask[Mask[i] % Size] = Mask[i];
8792 else if (BlendMask[Mask[i] % Size] != Mask[i])
8793 return SDValue(); // Can't blend in the needed input!
8795 PermuteMask[i] = Mask[i] % Size;
8798 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8799 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
8802 /// \brief Generic routine to decompose a shuffle and blend into independent
8803 /// blends and permutes.
8805 /// This matches the extremely common pattern for handling combined
8806 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
8807 /// operations. It will try to pick the best arrangement of shuffles and
8809 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
8813 SelectionDAG &DAG) {
8814 // Shuffle the input elements into the desired positions in V1 and V2 and
8815 // blend them together.
8816 SmallVector<int, 32> V1Mask(Mask.size(), -1);
8817 SmallVector<int, 32> V2Mask(Mask.size(), -1);
8818 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8819 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8820 if (Mask[i] >= 0 && Mask[i] < Size) {
8821 V1Mask[i] = Mask[i];
8823 } else if (Mask[i] >= Size) {
8824 V2Mask[i] = Mask[i] - Size;
8825 BlendMask[i] = i + Size;
8828 // Try to lower with the simpler initial blend strategy unless one of the
8829 // input shuffles would be a no-op. We prefer to shuffle inputs as the
8830 // shuffle may be able to fold with a load or other benefit. However, when
8831 // we'll have to do 2x as many shuffles in order to achieve this, blending
8832 // first is a better strategy.
8833 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
8834 if (SDValue BlendPerm =
8835 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
8838 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8839 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8840 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8843 /// \brief Try to lower a vector shuffle as a rotation.
8845 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
8846 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
8847 ArrayRef<int> Mask) {
8848 int NumElts = Mask.size();
8850 // We need to detect various ways of spelling a rotation:
8851 // [11, 12, 13, 14, 15, 0, 1, 2]
8852 // [-1, 12, 13, 14, -1, -1, 1, -1]
8853 // [-1, -1, -1, -1, -1, -1, 1, 2]
8854 // [ 3, 4, 5, 6, 7, 8, 9, 10]
8855 // [-1, 4, 5, 6, -1, -1, 9, -1]
8856 // [-1, 4, 5, 6, -1, -1, -1, -1]
8859 for (int i = 0; i < NumElts; ++i) {
8861 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
8862 "Unexpected mask index.");
8866 // Determine where a rotated vector would have started.
8867 int StartIdx = i - (M % NumElts);
8869 // The identity rotation isn't interesting, stop.
8872 // If we found the tail of a vector the rotation must be the missing
8873 // front. If we found the head of a vector, it must be how much of the head.
8875 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
8878 Rotation = CandidateRotation;
8879 else if (Rotation != CandidateRotation)
8880 // The rotations don't match, so we can't match this mask.
8883 // Compute which value this mask is pointing at.
8884 SDValue MaskV = M < NumElts ? V1 : V2;
8886 // Compute which of the two target values this index should be assigned
8887 // to. This reflects whether the high elements are remaining or the low
8888 // elements are remaining.
8889 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
8891 // Either set up this value if we've not encountered it before, or check
8892 // that it remains consistent.
8895 else if (TargetV != MaskV)
8896 // This may be a rotation, but it pulls from the inputs in some
8897 // unsupported interleaving.
8901 // Check that we successfully analyzed the mask, and normalize the results.
8902 assert(Rotation != 0 && "Failed to locate a viable rotation!");
8903 assert((Lo || Hi) && "Failed to find a rotated input vector!");
8915 /// \brief Try to lower a vector shuffle as a byte rotation.
8917 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
8918 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
8919 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
8920 /// try to generically lower a vector shuffle through such a pattern. It
8921 /// does not check for the profitability of lowering either as PALIGNR or
8922 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
8923 /// This matches shuffle vectors that look like:
8925 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
8927 /// Essentially it concatenates V1 and V2, shifts right by some number of
8928 /// elements, and takes the low elements as the result. Note that while this is
8929 /// specified as a *right shift* because x86 is little-endian, it is a *left
8930 /// rotate* of the vector lanes.
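/// For the v8i16 example above the element rotation is 3; since each of the 8
/// lane elements is 2 bytes wide, the returned byte rotation is 3 * 2 == 6.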
8931 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
8932 ArrayRef<int> Mask) {
8933 // Don't accept any shuffles with zero elements.
8934 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
8937 // PALIGNR works on 128-bit lanes.
8938 SmallVector<int, 16> RepeatedMask;
8939 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
8942 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
8946 // PALIGNR rotates bytes, so we need to scale the
8947 // rotation based on how many bytes are in the vector lane.
8948 int NumElts = RepeatedMask.size();
8949 int Scale = 16 / NumElts;
8950 return Rotation * Scale;
8953 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
8954 SDValue V1, SDValue V2,
8956 const X86Subtarget &Subtarget,
8957 SelectionDAG &DAG) {
8958 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
8960 SDValue Lo = V1, Hi = V2;
8961 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
8962 if (ByteRotation <= 0)
8965 // Cast the inputs to i8 vector of correct length to match PALIGNR or PSLLDQ/PSRLDQ.
8967 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8968 Lo = DAG.getBitcast(ByteVT, Lo);
8969 Hi = DAG.getBitcast(ByteVT, Hi);
8971 // SSSE3 targets can use the palignr instruction.
8972 if (Subtarget.hasSSSE3()) {
8973 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
8974 "512-bit PALIGNR requires BWI instructions");
8975 return DAG.getBitcast(
8976 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
8977 DAG.getConstant(ByteRotation, DL, MVT::i8)));
8980 assert(VT.is128BitVector() &&
8981 "Rotate-based lowering only supports 128-bit lowering!");
8982 assert(Mask.size() <= 16 &&
8983 "Can shuffle at most 16 bytes in a 128-bit vector!");
8984 assert(ByteVT == MVT::v16i8 &&
8985 "SSE2 rotate lowering only needed for v16i8!");
8987 // Default SSE2 implementation
8988 int LoByteShift = 16 - ByteRotation;
8989 int HiByteShift = ByteRotation;
8991 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
8992 DAG.getConstant(LoByteShift, DL, MVT::i8));
8993 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
8994 DAG.getConstant(HiByteShift, DL, MVT::i8));
8995 return DAG.getBitcast(VT,
8996 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
8999 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
9001 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
9002 /// rotation of the concatenation of two vectors; this routine will
9003 /// try to generically lower a vector shuffle through such a pattern.
9005 /// Essentially it concatenates V1 and V2, shifts right by some number of
9006 /// elements, and takes the low elements as the result. Note that while this is
9007 /// specified as a *right shift* because x86 is little-endian, it is a *left
9008 /// rotate* of the vector lanes.
9009 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
9010 SDValue V1, SDValue V2,
9012 const X86Subtarget &Subtarget,
9013 SelectionDAG &DAG) {
9014 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9015 "Only 32-bit and 64-bit elements are supported!");
9017 // 128/256-bit vectors are only supported with VLX.
9018 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9019 && "VLX required for 128/256-bit vectors");
9021 SDValue Lo = V1, Hi = V2;
9022 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
9026 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
9027 DAG.getConstant(Rotation, DL, MVT::i8));
9030 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9032 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9033 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9034 /// matches elements from one of the input vectors shuffled to the left or
9035 /// right with zeroable elements 'shifted in'. It handles both the strictly
9036 /// bit-wise element shifts and the byte shift across an entire 128-bit double quad word lane.
9039 /// PSLL : (little-endian) left bit shift.
9040 /// [ zz, 0, zz, 2 ]
9041 /// [ -1, 4, zz, -1 ]
9042 /// PSRL : (little-endian) right bit shift.
9044 /// [ -1, -1, 7, zz]
9045 /// PSLLDQ : (little-endian) left byte shift
9046 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
9047 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
9048 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
9049 /// PSRLDQ : (little-endian) right byte shift
9050 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
9051 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
9052 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
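/// For example, the v4i32 mask [ zz, 0, 1, 2 ] matches with Scale == 4 and
/// Shift == 1: the whole 128-bit lane is shifted left by one 32-bit element,
/// so this lowers to a byte shift (VSHLDQ / PSLLDQ) of 4 bytes with
/// ShiftVT == v16i8.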
9053 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9054 unsigned ScalarSizeInBits,
9055 ArrayRef<int> Mask, int MaskOffset,
9056 const APInt &Zeroable,
9057 const X86Subtarget &Subtarget) {
9058 int Size = Mask.size();
9059 unsigned SizeInBits = Size * ScalarSizeInBits;
9061 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
9062 for (int i = 0; i < Size; i += Scale)
9063 for (int j = 0; j < Shift; ++j)
9064 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
9070 auto MatchShift = [&](int Shift, int Scale, bool Left) {
9071 for (int i = 0; i != Size; i += Scale) {
9072 unsigned Pos = Left ? i + Shift : i;
9073 unsigned Low = Left ? i : i + Shift;
9074 unsigned Len = Scale - Shift;
9075 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
9079 int ShiftEltBits = ScalarSizeInBits * Scale;
9080 bool ByteShift = ShiftEltBits > 64;
9081 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9082 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
9083 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
9085 // Normalize the scale for byte shifts to still produce an i64 element type.
9087 Scale = ByteShift ? Scale / 2 : Scale;
9089 // We need to round trip through the appropriate type for the shift.
9090 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
9091 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
9092 : MVT::getVectorVT(ShiftSVT, Size / Scale);
9093 return (int)ShiftAmt;
9096 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
9097 // keep doubling the size of the integer elements up to that. We can
9098 // then shift the elements of the integer vector by whole multiples of
9099 // their width within the elements of the larger integer vector. Test each
9100 // multiple to see if we can find a match with the moved element indices
9101 // and that the shifted in elements are all zeroable.
9102 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
9103 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
9104 for (int Shift = 1; Shift != Scale; ++Shift)
9105 for (bool Left : {true, false})
9106 if (CheckZeros(Shift, Scale, Left)) {
9107 int ShiftAmt = MatchShift(Shift, Scale, Left);
9116 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
9117 SDValue V2, ArrayRef<int> Mask,
9118 const APInt &Zeroable,
9119 const X86Subtarget &Subtarget,
9120 SelectionDAG &DAG) {
9121 int Size = Mask.size();
9122 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9128 // Try to match shuffle against V1 shift.
9129 int ShiftAmt = matchVectorShuffleAsShift(
9130 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
9132 // If V1 failed, try to match shuffle against V2 shift.
9135 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
9136 Mask, Size, Zeroable, Subtarget);
9143 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
9144 "Illegal integer vector type");
9145 V = DAG.getBitcast(ShiftVT, V);
9146 V = DAG.getNode(Opcode, DL, ShiftVT, V,
9147 DAG.getConstant(ShiftAmt, DL, MVT::i8));
9148 return DAG.getBitcast(VT, V);
9151 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
9152 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
9153 SDValue V2, ArrayRef<int> Mask,
9154 const APInt &Zeroable,
9155 SelectionDAG &DAG) {
9156 int Size = Mask.size();
9157 int HalfSize = Size / 2;
9158 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9159 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
9161 // Upper half must be undefined.
9162 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9165 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
9166 // Remainder of lower half result is zero and upper half is all undef.
9167 auto LowerAsEXTRQ = [&]() {
9168 // Determine the extraction length from the part of the
9169 // lower half that isn't zeroable.
9171 for (; Len > 0; --Len)
9172 if (!Zeroable[Len - 1])
9174 assert(Len > 0 && "Zeroable shuffle mask");
9176 // Attempt to match first Len sequential elements from the lower half.
9179 for (int i = 0; i != Len; ++i) {
9183 SDValue &V = (M < Size ? V1 : V2);
9186 // The extracted elements must start at a valid index and all mask
9187 // elements must be in the lower half.
9188 if (i > M || M >= HalfSize)
9191 if (Idx < 0 || (Src == V && Idx == (M - i))) {
9202 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
9203 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9204 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9205 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
9206 DAG.getConstant(BitLen, DL, MVT::i8),
9207 DAG.getConstant(BitIdx, DL, MVT::i8));
9210 if (SDValue ExtrQ = LowerAsEXTRQ())
9213 // INSERTQ: Extract lowest Len elements from lower half of second source and
9214 // insert over first source, starting at Idx.
9215 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
9216 auto LowerAsInsertQ = [&]() {
9217 for (int Idx = 0; Idx != HalfSize; ++Idx) {
9220 // Attempt to match first source from mask before insertion point.
9221 if (isUndefInRange(Mask, 0, Idx)) {
9223 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
9225 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
9231 // Extend the extraction length looking to match both the insertion of
9232 // the second source and the remaining elements of the first.
9233 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
9238 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
9240 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
9246 // Match the remaining elements of the lower half.
9247 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
9249 } else if ((!Base || (Base == V1)) &&
9250 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
9252 } else if ((!Base || (Base == V2)) &&
9253 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
9260 // We may not have a base (first source) - this can safely be undefined.
9262 Base = DAG.getUNDEF(VT);
9264 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9265 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9266 return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
9267 DAG.getConstant(BitLen, DL, MVT::i8),
9268 DAG.getConstant(BitIdx, DL, MVT::i8));
9275 if (SDValue InsertQ = LowerAsInsertQ())
9281 /// \brief Lower a vector shuffle as a zero or any extension.
9283 /// Given a specific number of elements, element bit width, and extension
9284 /// stride, produce either a zero or any extension based on the available
9285 /// features of the subtarget. The extended elements are consecutive and
9286 /// can start from an offset element index in the input; to
9287 /// avoid excess shuffling the offset must either be in the bottom lane
9288 /// or at the start of a higher lane. All extended elements must come from a single input.
9290 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9291 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9292 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9293 assert(Scale > 1 && "Need a scale to extend.");
9294 int EltBits = VT.getScalarSizeInBits();
9295 int NumElements = VT.getVectorNumElements();
9296 int NumEltsPerLane = 128 / EltBits;
9297 int OffsetLane = Offset / NumEltsPerLane;
9298 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9299 "Only 8, 16, and 32 bit elements can be extended.");
9300 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9301 assert(0 <= Offset && "Extension offset must be non-negative.");
9302 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9303 "Extension offset must be in the first lane or start an upper lane.");
9305 // Check that an index is in the same lane as the base offset.
9306 auto SafeOffset = [&](int Idx) {
9307 return OffsetLane == (Idx / NumEltsPerLane);
9310 // Shift along an input so that the offset base moves to the first element.
9311 auto ShuffleOffset = [&](SDValue V) {
9315 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9316 for (int i = 0; i * Scale < NumElements; ++i) {
9317 int SrcIdx = i + Offset;
9318 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9320 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9323 // Found a valid zext mask! Try various lowering strategies based on the
9324 // input type and available ISA extensions.
9325 if (Subtarget.hasSSE41()) {
9326 // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
9327 // PUNPCK will catch this in a later shuffle match.
9328 if (Offset && Scale == 2 && VT.is128BitVector())
9330 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9331 NumElements / Scale);
9332 InputV = ShuffleOffset(InputV);
9333 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9334 return DAG.getBitcast(VT, InputV);
9337 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9339 // For any-extends we can cheat for larger element sizes and use shuffle
9340 // instructions that can fold with a load and/or copy.
9341 if (AnyExt && EltBits == 32) {
9342 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9344 return DAG.getBitcast(
9345 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9346 DAG.getBitcast(MVT::v4i32, InputV),
9347 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9349 if (AnyExt && EltBits == 16 && Scale > 2) {
9350 int PSHUFDMask[4] = {Offset / 2, -1,
9351 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9352 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9353 DAG.getBitcast(MVT::v4i32, InputV),
9354 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9355 int PSHUFWMask[4] = {1, -1, -1, -1};
9356 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9357 return DAG.getBitcast(
9358 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9359 DAG.getBitcast(MVT::v8i16, InputV),
9360 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9363 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes to 64-bit integers.
9365 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9366 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9367 assert(VT.is128BitVector() && "Unexpected vector width!");
9369 int LoIdx = Offset * EltBits;
9370 SDValue Lo = DAG.getBitcast(
9371 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9372 DAG.getConstant(EltBits, DL, MVT::i8),
9373 DAG.getConstant(LoIdx, DL, MVT::i8)));
9375 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9376 !SafeOffset(Offset + 1))
9377 return DAG.getBitcast(VT, Lo);
9379 int HiIdx = (Offset + 1) * EltBits;
9380 SDValue Hi = DAG.getBitcast(
9381 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9382 DAG.getConstant(EltBits, DL, MVT::i8),
9383 DAG.getConstant(HiIdx, DL, MVT::i8)));
9384 return DAG.getBitcast(VT,
9385 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9388 // If this would require more than 2 unpack instructions to expand, use
9389 // pshufb when available. We can only use more than 2 unpack instructions
9390 // when zero extending i8 elements which also makes it easier to use pshufb.
9391 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9392 assert(NumElements == 16 && "Unexpected byte vector width!");
9393 SDValue PSHUFBMask[16];
9394 for (int i = 0; i < 16; ++i) {
9395 int Idx = Offset + (i / Scale);
9396 PSHUFBMask[i] = DAG.getConstant(
9397 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9399 InputV = DAG.getBitcast(MVT::v16i8, InputV);
9400 return DAG.getBitcast(
9401 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9402 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
9405 // If we are extending from an offset, ensure we start on a boundary that
9406 // we can unpack from.
9407 int AlignToUnpack = Offset % (NumElements / Scale);
9408 if (AlignToUnpack) {
9409 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9410 for (int i = AlignToUnpack; i < NumElements; ++i)
9411 ShMask[i - AlignToUnpack] = i;
9412 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
9413 Offset -= AlignToUnpack;
9414 }
9416 // Otherwise emit a sequence of unpacks.
9417 do {
9418 unsigned UnpackLoHi = X86ISD::UNPCKL;
9419 if (Offset >= (NumElements / 2)) {
9420 UnpackLoHi = X86ISD::UNPCKH;
9421 Offset -= (NumElements / 2);
9422 }
9424 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
9425 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
9426 : getZeroVector(InputVT, Subtarget, DAG, DL);
9427 InputV = DAG.getBitcast(InputVT, InputV);
9428 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
9429 Scale /= 2;
9430 EltBits *= 2;
9431 NumElements /= 2;
9432 } while (Scale > 1);
9433 return DAG.getBitcast(VT, InputV);
9434 }
9436 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
9438 /// This routine will try to do everything in its power to cleverly lower
9439 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
9440 /// check for the profitability of this lowering, it tries to aggressively
9441 /// match this pattern. It will use all of the micro-architectural details it
9442 /// can to emit an efficient lowering. It handles both blends with all-zero
9443 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9444 /// masking out later).
9446 /// The reason we have dedicated lowering for zext-style shuffles is that they
9447 /// are both incredibly common and often quite performance sensitive.
9448 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9449 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9450 const APInt &Zeroable, const X86Subtarget &Subtarget,
9451 SelectionDAG &DAG) {
9452 int Bits = VT.getSizeInBits();
9453 int NumLanes = Bits / 128;
9454 int NumElements = VT.getVectorNumElements();
9455 int NumEltsPerLane = NumElements / NumLanes;
9456 assert(VT.getScalarSizeInBits() <= 32 &&
9457 "Exceeds 32-bit integer zero extension limit");
9458 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9460 // Define a helper function to check a particular ext-scale and lower to it if
9461 // valid.
9462 auto Lower = [&](int Scale) -> SDValue {
9463 SDValue InputV;
9464 bool AnyExt = true;
9465 int Offset = 0;
9466 int Matches = 0;
9467 for (int i = 0; i < NumElements; ++i) {
9468 int M = Mask[i];
9469 if (M < 0)
9470 continue; // Valid anywhere but doesn't tell us anything.
9471 if (i % Scale != 0) {
9472 // Each of the extended elements needs to be zeroable.
9473 if (!Zeroable[i])
9474 return SDValue();
9476 // We no longer are in the anyext case.
9477 AnyExt = false;
9478 continue;
9479 }
9481 // Each of the base elements needs to be consecutive indices into the
9482 // same input vector.
9483 SDValue V = M < NumElements ? V1 : V2;
9484 M = M % NumElements;
9485 if (!InputV) {
9486 InputV = V;
9487 Offset = M - (i / Scale);
9488 } else if (InputV != V)
9489 return SDValue(); // Flip-flopping inputs.
9491 // Offset must start in the lowest 128-bit lane or at the start of an
9493 // FIXME: Is it ever worth allowing a negative base offset?
9494 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
9495 (Offset % NumEltsPerLane) == 0))
9496 return SDValue();
9498 // If we are offsetting, all referenced entries must come from the same
9499 // lane.
9500 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
9501 return SDValue();
9503 if ((M % NumElements) != (Offset + (i / Scale)))
9504 return SDValue(); // Non-consecutive strided elements.
9505 Matches++;
9506 }
9508 // If we fail to find an input, we have a zero-shuffle which should always
9509 // have already been handled.
9510 // FIXME: Maybe handle this here in case during blending we end up with one?
9511 if (!InputV)
9512 return SDValue();
9514 // If we are offsetting, don't extend if we only match a single input, we
9515 // can always do better by using a basic PSHUF or PUNPCK.
9516 if (Offset != 0 && Matches < 2)
9517 return SDValue();
9519 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9520 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
9521 };
9523 // The widest scale possible for extending is to a 64-bit integer.
9524 assert(Bits % 64 == 0 &&
9525 "The number of bits in a vector must be divisible by 64 on x86!");
9526 int NumExtElements = Bits / 64;
9528 // Each iteration, try extending the elements half as much, but into twice as
9530 for (; NumExtElements < NumElements; NumExtElements *= 2) {
9531 assert(NumElements % NumExtElements == 0 &&
9532 "The input vector size must be divisible by the extended size.");
9533 if (SDValue V = Lower(NumElements / NumExtElements))
9534 return V;
9535 }
9537 // General extends failed, but 128-bit vectors may be able to use MOVQ.
9538 if (Bits != 128)
9539 return SDValue();
9541 // Returns one of the source operands if the shuffle can be reduced to a
9542 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
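// Illustrative example (editor's addition): for v4i32, a mask such as
// <0, 1, Z, Z> (Z denoting a zeroable lane) reduces to a MOVQ of V1, and
// <4, 5, Z, Z> reduces to a MOVQ of V2.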
9543 auto CanZExtLowHalf = [&]() {
9544 for (int i = NumElements / 2; i != NumElements; ++i)
9545 if (!Zeroable[i])
9546 return SDValue();
9547 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
9548 return V1;
9549 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
9550 return V2;
9551 return SDValue();
9552 };
9554 if (SDValue V = CanZExtLowHalf()) {
9555 V = DAG.getBitcast(MVT::v2i64, V);
9556 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
9557 return DAG.getBitcast(VT, V);
9558 }
9560 // No viable ext lowering found.
9561 return SDValue();
9562 }
9564 /// \brief Try to get a scalar value for a specific element of a vector.
9566 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
9567 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
9568 SelectionDAG &DAG) {
9569 MVT VT = V.getSimpleValueType();
9570 MVT EltVT = VT.getVectorElementType();
9571 V = peekThroughBitcasts(V);
9573 // If the bitcasts shift the element size, we can't extract an equivalent
9574 // element from it.
9575 MVT NewVT = V.getSimpleValueType();
9576 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
9577 return SDValue();
9579 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9580 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
9581 // Ensure the scalar operand is the same size as the destination.
9582 // FIXME: Add support for scalar truncation where possible.
9583 SDValue S = V.getOperand(Idx);
9584 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
9585 return DAG.getBitcast(EltVT, S);
9586 }
9588 return SDValue();
9589 }
9591 /// \brief Helper to test for a load that can be folded with x86 shuffles.
9593 /// This is particularly important because the set of instructions varies
9594 /// significantly based on whether the operand is a load or not.
9595 static bool isShuffleFoldableLoad(SDValue V) {
9596 V = peekThroughBitcasts(V);
9597 return ISD::isNON_EXTLoad(V.getNode());
9598 }
9600 /// \brief Try to lower insertion of a single element into a zero vector.
9602 /// This is a common pattern for which we have especially efficient lowering
9603 /// patterns across all subtarget feature sets.
9604 static SDValue lowerVectorShuffleAsElementInsertion(
9605 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9606 const APInt &Zeroable, const X86Subtarget &Subtarget,
9607 SelectionDAG &DAG) {
9608 MVT ExtVT = VT;
9609 MVT EltVT = VT.getVectorElementType();
9611 int V2Index =
9612 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
9613 Mask.begin();
9614 bool IsV1Zeroable = true;
9615 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9616 if (i != V2Index && !Zeroable[i]) {
9617 IsV1Zeroable = false;
9618 break;
9619 }
9621 // Check for a single input from a SCALAR_TO_VECTOR node.
9622 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
9623 // all the smarts here sunk into that routine. However, the current
9624 // lowering of BUILD_VECTOR makes that nearly impossible until the old
9625 // vector shuffle lowering is dead.
9626 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
9627 DAG);
9628 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
9629 // We need to zext the scalar if it is smaller than an i32.
9630 V2S = DAG.getBitcast(EltVT, V2S);
9631 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
9632 // Using zext to expand a narrow element won't work for non-zero
9637 // Zero-extend directly to i32.
9638 ExtVT = MVT::v4i32;
9639 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
9640 }
9641 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
9642 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
9643 EltVT == MVT::i16) {
9644 // Either not inserting from the low element of the input or the input
9645 // element size is too small to use VZEXT_MOVL to clear the high bits.
9646 return SDValue();
9647 }
9649 if (!IsV1Zeroable) {
9650 // If V1 can't be treated as a zero vector we have fewer options to lower
9651 // this. We can't support integer vectors or non-zero targets cheaply, and
9652 // the V1 elements can't be permuted in any way.
9653 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9654 if (!VT.isFloatingPoint() || V2Index != 0)
9655 return SDValue();
9656 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
9657 V1Mask[V2Index] = -1;
9658 if (!isNoopShuffleMask(V1Mask))
9659 return SDValue();
9660 // This is essentially a special case blend operation, but if we have
9661 // general purpose blend operations, they are always faster. Bail and let
9662 // the rest of the lowering handle these as blends.
9663 if (Subtarget.hasSSE41())
9664 return SDValue();
9666 // Otherwise, use MOVSD or MOVSS.
9667 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9668 "Only two types of floating point element types to handle!");
9669 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
9670 ExtVT, V1, V2);
9671 }
9673 // This lowering only works for the low element with floating point vectors.
9674 if (VT.isFloatingPoint() && V2Index != 0)
9675 return SDValue();
9677 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
9678 if (ExtVT != VT)
9679 V2 = DAG.getBitcast(VT, V2);
9681 if (V2Index != 0) {
9682 // If we have 4 or fewer lanes we can cheaply shuffle the element into
9683 // the desired position. Otherwise it is more efficient to do a vector
9684 // shift left. We know that we can do a vector shift left because all
9685 // the inputs are zero.
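// Illustrative example (editor's addition): inserting into lane 3 of a
// v8i16 vector takes the byte-shift path below with a shift amount of
// 3 * 16 / 8 == 6 bytes.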
9686 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
9687 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
9688 V2Shuffle[V2Index] = 0;
9689 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
9690 } else {
9691 V2 = DAG.getBitcast(MVT::v16i8, V2);
9692 V2 = DAG.getNode(
9693 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
9694 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
9695 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
9696 DAG.getDataLayout(), VT)));
9697 V2 = DAG.getBitcast(VT, V2);
9698 }
9699 }
9701 return V2;
9702 }
9703 /// Try to lower broadcast of a single - truncated - integer element,
9704 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
9706 /// This assumes we have AVX2.
9707 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
9708 SDValue V0, int BroadcastIdx,
9709 const X86Subtarget &Subtarget,
9710 SelectionDAG &DAG) {
9711 assert(Subtarget.hasAVX2() &&
9712 "We can only lower integer broadcasts with AVX2!");
9714 EVT EltVT = VT.getVectorElementType();
9715 EVT V0VT = V0.getValueType();
9717 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
9718 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
9720 EVT V0EltVT = V0VT.getVectorElementType();
9721 if (!V0EltVT.isInteger())
9722 return SDValue();
9724 const unsigned EltSize = EltVT.getSizeInBits();
9725 const unsigned V0EltSize = V0EltVT.getSizeInBits();
9727 // This is only a truncation if the original element type is larger.
9728 if (V0EltSize <= EltSize)
9729 return SDValue();
9731 assert(((V0EltSize % EltSize) == 0) &&
9732 "Scalar type sizes must all be powers of 2 on x86!");
9734 const unsigned V0Opc = V0.getOpcode();
9735 const unsigned Scale = V0EltSize / EltSize;
9736 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
9738 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
9739 V0Opc != ISD::BUILD_VECTOR)
9740 return SDValue();
9742 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
9744 // If we're extracting non-least-significant bits, shift so we can truncate.
9745 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
9746 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
9747 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
9748 if (const int OffsetIdx = BroadcastIdx % Scale)
9749 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
9750 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
9752 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
9753 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
9754 }
9756 /// \brief Try to lower broadcast of a single element.
9758 /// For convenience, this code also bundles all of the subtarget feature set
9759 /// filtering. While a little annoying to re-dispatch on type here, there isn't
9760 /// a convenient way to factor it out.
9761 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
9762 SDValue V1, SDValue V2,
9763 ArrayRef<int> Mask,
9764 const X86Subtarget &Subtarget,
9765 SelectionDAG &DAG) {
9766 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
9767 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
9768 (Subtarget.hasAVX2() && VT.isInteger())))
9769 return SDValue();
9771 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
9772 // we can only broadcast from a register with AVX2.
9773 unsigned NumElts = Mask.size();
9774 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
9775 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
9777 // Check that the mask is a broadcast.
9778 int BroadcastIdx = -1;
9779 for (int i = 0; i != (int)NumElts; ++i) {
9780 SmallVector<int, 8> BroadcastMask(NumElts, i);
9781 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
9782 BroadcastIdx = i;
9783 break;
9784 }
9785 }
9787 if (BroadcastIdx < 0)
9788 return SDValue();
9789 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
9790 "a sorted mask where the broadcast "
9791 "comes from V1.");
9793 // Go up the chain of (vector) values to find a scalar load that we can
9794 // combine with the broadcast.
9795 SDValue V = V1;
9796 for (;;) {
9797 switch (V.getOpcode()) {
9798 case ISD::BITCAST: {
9799 SDValue VSrc = V.getOperand(0);
9800 MVT SrcVT = VSrc.getSimpleValueType();
9801 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
9802 break;
9803 V = VSrc;
9804 continue;
9805 }
9806 case ISD::CONCAT_VECTORS: {
9807 int OperandSize = Mask.size() / V.getNumOperands();
9808 V = V.getOperand(BroadcastIdx / OperandSize);
9809 BroadcastIdx %= OperandSize;
9810 continue;
9811 }
9812 case ISD::INSERT_SUBVECTOR: {
9813 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
9814 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
9815 if (!ConstantIdx)
9816 break;
9818 int BeginIdx = (int)ConstantIdx->getZExtValue();
9819 int EndIdx =
9820 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
9821 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
9822 BroadcastIdx -= BeginIdx;
9833 // Check if this is a broadcast of a scalar. We special case lowering
9834 // for scalars so that we can more effectively fold with loads.
9835 // First, look through bitcast: if the original value has a larger element
9836 // type than the shuffle, the broadcast element is in essence truncated.
9837 // Make that explicit to ease folding.
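// Illustrative example (editor's addition): broadcasting i16 element 1 of a
// value bitcast from a v4i32 build_vector becomes: take i32 scalar 0, shift
// it right by 16, truncate to i16, and broadcast that scalar (see
// lowerVectorShuffleAsTruncBroadcast above).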
9838 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
9839 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
9840 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
9841 return TruncBroadcast;
9843 MVT BroadcastVT = VT;
9845 // Peek through any bitcast (only useful for loads).
9846 SDValue BC = peekThroughBitcasts(V);
9848 // Also check the simpler case, where we can directly reuse the scalar.
9849 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9850 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
9851 V = V.getOperand(BroadcastIdx);
9853 // If we can't broadcast from a register, check that the input is a load.
9854 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
9855 return SDValue();
9856 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
9857 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9858 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
9859 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
9860 Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
9861 }
9863 // If we are broadcasting a load that is only used by the shuffle
9864 // then we can reduce the vector load to the broadcasted scalar load.
9865 LoadSDNode *Ld = cast<LoadSDNode>(BC);
9866 SDValue BaseAddr = Ld->getOperand(1);
9867 EVT SVT = BroadcastVT.getScalarType();
9868 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
9869 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
9870 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
9871 DAG.getMachineFunction().getMachineMemOperand(
9872 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
9874 // Make sure the newly-created LOAD is in the same position as Ld in
9875 // terms of dependency. We create a TokenFactor for Ld and V,
9876 // and update uses of Ld's output chain to use the TokenFactor.
9877 if (Ld->hasAnyUseOfValue(1)) {
9878 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
9879 SDValue(Ld, 1), SDValue(V.getNode(), 1));
9880 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
9881 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
9882 SDValue(V.getNode(), 1));
9883 }
9884 } else if (!BroadcastFromReg) {
9885 // We can't broadcast from a vector register.
9886 return SDValue();
9887 } else if (BroadcastIdx != 0) {
9888 // We can only broadcast from the zero-element of a vector register,
9889 // but it can be advantageous to broadcast from the zero-element of a
9890 // subvector.
9891 if (!VT.is256BitVector() && !VT.is512BitVector())
9892 return SDValue();
9894 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
9895 if (VT == MVT::v4f64 || VT == MVT::v4i64)
9896 return SDValue();
9898 // Only broadcast the zero-element of a 128-bit subvector.
9899 unsigned EltSize = VT.getScalarSizeInBits();
9900 if (((BroadcastIdx * EltSize) % 128) != 0)
9901 return SDValue();
9903 // The shuffle input might have been a bitcast we looked through; look at
9904 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
9905 // later bitcast it to BroadcastVT.
9906 MVT SrcVT = V.getSimpleValueType();
9907 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
9908 "Unexpected vector element size");
9909 assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
9910 "Unexpected vector size");
9912 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
9913 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
9914 DAG.getIntPtrConstant(BroadcastIdx, DL));
9915 }
9917 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
9918 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
9919 DAG.getBitcast(MVT::f64, V));
9921 // Bitcast back to the same scalar type as BroadcastVT.
9922 MVT SrcVT = V.getSimpleValueType();
9923 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
9924 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
9925 "Unexpected vector element size");
9926 if (SrcVT.isVector()) {
9927 unsigned NumSrcElts = SrcVT.getVectorNumElements();
9928 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
9929 } else {
9930 SrcVT = BroadcastVT.getScalarType();
9931 }
9932 V = DAG.getBitcast(SrcVT, V);
9933 }
9935 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9936 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
9937 V = DAG.getBitcast(MVT::f64, V);
9938 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
9939 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
9940 }
9942 // We only support broadcasting from 128-bit vectors to minimize the
9943 // number of patterns we need to deal with in isel. So extract down to
9945 if (SrcVT.getSizeInBits() > 128)
9946 V = extract128BitVector(V, 0, DAG, DL);
9948 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
9949 }
9951 // Check for whether we can use INSERTPS to perform the shuffle. We only use
9952 // INSERTPS when the V1 elements are already in the correct locations
9953 // because otherwise we can just always use two SHUFPS instructions which
9954 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
9955 // perform INSERTPS if a single V1 element is out of place and all V2
9956 // elements are zeroable.
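// Note (editor's addition): the INSERTPS immediate built below packs a 2-bit
// source element index into bits [7:6], a 2-bit destination element index
// into bits [5:4], and a 4-bit zero mask into bits [3:0], matching
// "VBSrcIndex << 6 | VBDstIndex << 4 | ZMask".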
9957 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
9958 unsigned &InsertPSMask,
9959 const APInt &Zeroable,
9960 ArrayRef<int> Mask,
9961 SelectionDAG &DAG) {
9962 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
9963 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
9964 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9966 // Attempt to match INSERTPS with one element from VA or VB being
9967 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
9969 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
9970 ArrayRef<int> CandidateMask) {
9971 unsigned ZMask = 0;
9972 int VADstIndex = -1;
9973 int VBDstIndex = -1;
9974 bool VAUsedInPlace = false;
9976 for (int i = 0; i < 4; ++i) {
9977 // Synthesize a zero mask from the zeroable elements (includes undefs).
9978 if (Zeroable[i]) {
9979 ZMask |= 1 << i;
9980 continue;
9981 }
9983 // Flag if we use any VA inputs in place.
9984 if (i == CandidateMask[i]) {
9985 VAUsedInPlace = true;
9986 continue;
9987 }
9989 // We can only insert a single non-zeroable element.
9990 if (VADstIndex >= 0 || VBDstIndex >= 0)
9991 return false;
9993 if (CandidateMask[i] < 4) {
9994 // VA input out of place for insertion.
9995 VADstIndex = i;
9996 } else {
9997 // VB input for insertion.
9998 VBDstIndex = i;
9999 }
10000 }
10002 // Don't bother if we have no (non-zeroable) element for insertion.
10003 if (VADstIndex < 0 && VBDstIndex < 0)
10004 return false;
10006 // Determine element insertion src/dst indices. The src index is from the
10007 // start of the inserted vector, not the start of the concatenated vector.
10008 unsigned VBSrcIndex = 0;
10009 if (VADstIndex >= 0) {
10010 // If we have a VA input out of place, we use VA as the V2 element
10011 // insertion and don't use the original V2 at all.
10012 VBSrcIndex = CandidateMask[VADstIndex];
10013 VBDstIndex = VADstIndex;
10014 VB = VA;
10015 } else {
10016 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
10017 }
10019 // If no V1 inputs are used in place, then the result is created only from
10020 // the zero mask and the V2 insertion - so remove V1 dependency.
10021 if (!VAUsedInPlace)
10022 VA = DAG.getUNDEF(MVT::v4f32);
10024 // Update V1, V2 and InsertPSMask accordingly.
10025 V1 = VA;
10026 V2 = VB;
10028 // Insert the V2 element into the desired position.
10029 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
10030 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
10031 return true;
10032 };
10034 if (matchAsInsertPS(V1, V2, Mask))
10035 return true;
10037 // Commute and try again.
10038 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10039 ShuffleVectorSDNode::commuteMask(CommutedMask);
10040 if (matchAsInsertPS(V2, V1, CommutedMask))
10041 return true;
10043 return false;
10044 }
10046 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
10047 SDValue V2, ArrayRef<int> Mask,
10048 const APInt &Zeroable,
10049 SelectionDAG &DAG) {
10050 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10051 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10053 // Attempt to match the insertps pattern.
10054 unsigned InsertPSMask;
10055 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
10056 return SDValue();
10058 // Insert the V2 element into the desired position.
10059 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
10060 DAG.getConstant(InsertPSMask, DL, MVT::i8));
10061 }
10063 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
10064 /// UNPCK instruction.
10066 /// This specifically targets cases where we end up with alternating between
10067 /// the two inputs, and so can permute them into something that feeds a single
10068 /// UNPCK instruction. Note that this routine only targets integer vectors
10069 /// because for floating point vectors we have a generalized SHUFPS lowering
10070 /// strategy that handles everything that doesn't *exactly* match an unpack,
10071 /// making this clever lowering unnecessary.
10072 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10073 SDValue V1, SDValue V2,
10074 ArrayRef<int> Mask,
10075 SelectionDAG &DAG) {
10076 assert(!VT.isFloatingPoint() &&
10077 "This routine only supports integer vectors.");
10078 assert(VT.is128BitVector() &&
10079 "This routine only works on 128-bit vectors.");
10080 assert(!V2.isUndef() &&
10081 "This routine should only be used when blending two inputs.");
10082 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10084 int Size = Mask.size();
10086 int NumLoInputs =
10087 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10088 int NumHiInputs =
10089 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10091 bool UnpackLo = NumLoInputs >= NumHiInputs;
10093 auto TryUnpack = [&](int ScalarSize, int Scale) {
10094 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10095 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10097 for (int i = 0; i < Size; ++i) {
10098 if (Mask[i] < 0)
10099 continue;
10101 // Each element of the unpack contains Scale elements from this mask.
10102 int UnpackIdx = i / Scale;
10104 // We only handle the case where V1 feeds the first slots of the unpack.
10105 // We rely on canonicalization to ensure this is the case.
10106 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10107 return SDValue();
10109 // Setup the mask for this input. The indexing is tricky as we have to
10110 // handle the unpack stride.
10111 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10112 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10113 Mask[i] % Size;
10114 }
10116 // If we will have to shuffle both inputs to use the unpack, check whether
10117 // we can just unpack first and shuffle the result. If so, skip this unpack.
10118 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10119 !isNoopShuffleMask(V2Mask))
10120 return SDValue();
10122 // Shuffle the inputs into place.
10123 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10124 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10126 // Cast the inputs to the type we will use to unpack them.
10127 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10128 V1 = DAG.getBitcast(UnpackVT, V1);
10129 V2 = DAG.getBitcast(UnpackVT, V2);
10131 // Unpack the inputs and cast the result back to the desired type.
10132 return DAG.getBitcast(
10133 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10134 UnpackVT, V1, V2));
10135 };
10137 // We try each unpack from the largest to the smallest to try and find one
10138 // that fits this mask.
10139 int OrigScalarSize = VT.getScalarSizeInBits();
10140 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10141 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10142 return Unpack;
10144 // If none of the unpack-rooted lowerings worked (or were profitable) try an
10146 if (NumLoInputs == 0 || NumHiInputs == 0) {
10147 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10148 "We have to have *some* inputs!");
10149 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10151 // FIXME: We could consider the total complexity of the permute of each
10152 // possible unpacking. Or at the least we should consider how many
10153 // half-crossings are created.
10154 // FIXME: We could consider commuting the unpacks.
10156 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10157 for (int i = 0; i < Size; ++i) {
10158 if (Mask[i] < 0)
10159 continue;
10161 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10163 PermMask[i] =
10164 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10165 }
10166 return DAG.getVectorShuffle(
10167 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
10168 DL, VT, V1, V2),
10169 DAG.getUNDEF(VT), PermMask);
10170 }
10172 return SDValue();
10173 }
10175 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
10177 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
10178 /// support for floating point shuffles but not integer shuffles. These
10179 /// instructions will incur a domain crossing penalty on some chips though so
10180 /// it is better to avoid lowering through this for integer vectors where
10181 /// possible.
10182 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10183 const APInt &Zeroable,
10184 SDValue V1, SDValue V2,
10185 const X86Subtarget &Subtarget,
10186 SelectionDAG &DAG) {
10187 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10188 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10189 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10191 if (V2.isUndef()) {
10192 // Check for being able to broadcast a single element.
10193 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10194 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
10195 return Broadcast;
10197 // Straight shuffle of a single input vector. Simulate this by using the
10198 // single input as both of the "inputs" to this instruction.
10199 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
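// Illustrative note (editor's addition): the SHUFPD immediate selects the
// source element for each result lane (bit 0 for lane 0, bit 1 for lane 1),
// so a splat of the high element (mask <1, 1>) yields an immediate of 0b11.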
10201 if (Subtarget.hasAVX()) {
10202 // If we have AVX, we can use VPERMILPS which will allow folding a load
10203 // into the shuffle.
10204 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
10205 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10206 }
10208 return DAG.getNode(
10209 X86ISD::SHUFP, DL, MVT::v2f64,
10210 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10211 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10212 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10213 }
10214 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10215 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10217 // If we have a single input, insert that into V1 if we can do so cheaply.
10218 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
10219 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10220 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10221 return Insertion;
10222 // Try inverting the insertion since for v2 masks it is easy to do and we
10223 // can't reliably sort the mask one way or the other.
10224 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
10225 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
10226 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10227 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10228 return Insertion;
10229 }
10231 // Try to use one of the special instruction patterns to handle two common
10232 // blend patterns if a zero-blend above didn't work.
10233 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
10234 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
10235 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
10236 // We can either use a special instruction to load over the low double or
10237 // to move just the low double.
10238 return DAG.getNode(
10239 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
10240 DL, MVT::v2f64, V2,
10241 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
10243 if (Subtarget.hasSSE41())
10244 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
10245 Zeroable, Subtarget, DAG))
10246 return Blend;
10248 // Use dedicated unpack instructions for masks that match their pattern.
10249 if (SDValue V =
10250 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
10251 return V;
10253 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
10254 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
10255 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10256 }
10258 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
10260 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
10261 /// the integer unit to minimize domain crossing penalties. However, for blends
10262 /// it falls back to the floating point shuffle operation with appropriate bit
10263 /// casting.
10264 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10265 const APInt &Zeroable,
10266 SDValue V1, SDValue V2,
10267 const X86Subtarget &Subtarget,
10268 SelectionDAG &DAG) {
10269 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10270 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10271 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10273 if (V2.isUndef()) {
10274 // Check for being able to broadcast a single element.
10275 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10276 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10277 return Broadcast;
10279 // Straight shuffle of a single input vector. For everything from SSE2
10280 // onward this has a single fast instruction with no scary immediates.
10281 // We have to map the mask as it is actually a v4i32 shuffle instruction.
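// Illustrative example (editor's addition): a v2i64 mask of <1, 0> widens to
// the v4i32 PSHUFD mask <2, 3, 0, 1>.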
10282 V1 = DAG.getBitcast(MVT::v4i32, V1);
10283 int WidenedMask[4] = {
10284 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
10285 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
10286 return DAG.getBitcast(
10287 MVT::v2i64,
10288 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10289 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
10290 }
10291 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10292 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10293 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10294 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10296 // If we have a blend of two same-type PACKUS operations and the blend aligns
10297 // with the low and high halves, we can just merge the PACKUS operations.
10298 // This is particularly important as it lets us merge shuffles that this
10299 // routine itself creates.
10300 auto GetPackNode = [](SDValue V) {
10301 V = peekThroughBitcasts(V);
10302 return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
10303 };
10304 if (SDValue V1Pack = GetPackNode(V1))
10305 if (SDValue V2Pack = GetPackNode(V2)) {
10306 EVT PackVT = V1Pack.getValueType();
10307 if (PackVT == V2Pack.getValueType())
10308 return DAG.getBitcast(MVT::v2i64,
10309 DAG.getNode(X86ISD::PACKUS, DL, PackVT,
10310 Mask[0] == 0 ? V1Pack.getOperand(0)
10311 : V1Pack.getOperand(1),
10312 Mask[1] == 2 ? V2Pack.getOperand(0)
10313 : V2Pack.getOperand(1)));
10314 }
10316 // Try to use shift instructions.
10317 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10318 Zeroable, Subtarget, DAG))
10319 return Shift;
10321 // When loading a scalar and then shuffling it into a vector we can often do
10322 // the insertion cheaply.
10323 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10324 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10325 return Insertion;
10326 // Try inverting the insertion since for v2 masks it is easy to do and we
10327 // can't reliably sort the mask one way or the other.
10328 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
10329 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10330 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10331 return Insertion;
10333 // We have different paths for blend lowering, but they all must use the
10334 // *exact* same predicate.
10335 bool IsBlendSupported = Subtarget.hasSSE41();
10336 if (IsBlendSupported)
10337 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
10338 Zeroable, Subtarget, DAG))
10339 return Blend;
10341 // Use dedicated unpack instructions for masks that match their pattern.
10342 if (SDValue V =
10343 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
10344 return V;
10346 // Try to use byte rotation instructions.
10347 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10348 if (Subtarget.hasSSSE3())
10349 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10350 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10351 return Rotate;
10353 // If we have direct support for blends, we should lower by decomposing into
10354 // a permute. That will be faster than the domain cross.
10355 if (IsBlendSupported)
10356 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
10357 Mask, DAG);
10359 // We implement this with SHUFPD which is pretty lame because it will likely
10360 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
10361 // However, all the alternatives are still more cycles and newer chips don't
10362 // have this problem. It would be really nice if x86 had better shuffles here.
10363 V1 = DAG.getBitcast(MVT::v2f64, V1);
10364 V2 = DAG.getBitcast(MVT::v2f64, V2);
10365 return DAG.getBitcast(MVT::v2i64,
10366 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
10367 }
10369 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
10371 /// This is used to disable more specialized lowerings when the shufps lowering
10372 /// will happen to be efficient.
10373 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10374 // This routine only handles 128-bit shufps.
10375 assert(Mask.size() == 4 && "Unsupported mask size!");
10376 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10377 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10378 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10379 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10381 // To lower with a single SHUFPS we need to have the low half and high half
10382 // each requiring a single input.
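// Illustrative example (editor's addition): <0, 1, 6, 7> takes one input per
// half and can be a single SHUFPS, while <0, 4, 1, 5> mixes both inputs in
// the low half and is rejected below.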
10383 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
10384 return false;
10385 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
10386 return false;
10388 return true;
10389 }
10391 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10393 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10394 /// It makes no assumptions about whether this is the *best* lowering, it simply
10396 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10397 ArrayRef<int> Mask, SDValue V1,
10398 SDValue V2, SelectionDAG &DAG) {
10399 SDValue LowV = V1, HighV = V2;
10400 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10402 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10404 if (NumV2Elements == 1) {
10405 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10407 // Compute the index adjacent to V2Index and in the same half by toggling
10408 // the low bit.
10409 int V2AdjIndex = V2Index ^ 1;
10411 if (Mask[V2AdjIndex] < 0) {
10412 // Handles all the cases where we have a single V2 element and an undef.
10413 // This will only ever happen in the high lanes because we commute the
10414 // vector otherwise.
10415 if (V2Index < 2)
10416 std::swap(LowV, HighV);
10417 NewMask[V2Index] -= 4;
10418 } else {
10419 // Handle the case where the V2 element ends up adjacent to a V1 element.
10420 // To make this work, blend them together as the first step.
10421 int V1Index = V2AdjIndex;
10422 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
10423 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
10424 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10426 // Now proceed to reconstruct the final blend as we have the necessary
10427 // high or low half formed.
10428 if (V2Index < 2) {
10429 LowV = V2;
10430 HighV = V1;
10431 } else {
10432 HighV = V2;
10433 }
10434 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
10435 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
10436 }
10437 } else if (NumV2Elements == 2) {
10438 if (Mask[0] < 4 && Mask[1] < 4) {
10439 // Handle the easy case where we have V1 in the low lanes and V2 in the
10440 // high lanes.
10441 NewMask[2] -= 4;
10442 NewMask[3] -= 4;
10443 } else if (Mask[2] < 4 && Mask[3] < 4) {
10444 // We also handle the reversed case because this utility may get called
10445 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
10446 // arrange things in the right direction.
10447 NewMask[0] -= 4;
10448 NewMask[1] -= 4;
10449 HighV = V1;
10450 LowV = V2;
10451 } else {
10452 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10453 // trying to place elements directly, just blend them and set up the final
10454 // shuffle to place them.
10456 // The first two blend mask elements are for V1, the second two are for
10458 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10459 Mask[2] < 4 ? Mask[2] : Mask[3],
10460 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10461 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10462 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10463 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10465 // Now we do a normal shuffle of V1 by giving V1 as both operands to
10466 // the shuffle.
10467 LowV = HighV = V1;
10468 NewMask[0] = Mask[0] < 4 ? 0 : 2;
10469 NewMask[1] = Mask[0] < 4 ? 2 : 0;
10470 NewMask[2] = Mask[2] < 4 ? 1 : 3;
10471 NewMask[3] = Mask[2] < 4 ? 3 : 1;
10472 }
10473 }
10474 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10475 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10476 }
10478 /// \brief Lower 4-lane 32-bit floating point shuffles.
10480 /// Uses instructions exclusively from the floating point unit to minimize
10481 /// domain crossing penalties, as these are sufficient to implement all v4f32
10483 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10484 const APInt &Zeroable,
10485 SDValue V1, SDValue V2,
10486 const X86Subtarget &Subtarget,
10487 SelectionDAG &DAG) {
10488 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10489 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10490 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10492 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10494 if (NumV2Elements == 0) {
10495 // Check for being able to broadcast a single element.
10496 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10497 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10498 return Broadcast;
10500 // Use even/odd duplicate instructions for masks that match their pattern.
10501 if (Subtarget.hasSSE3()) {
10502 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10503 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10504 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10505 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10506 }
10508 if (Subtarget.hasAVX()) {
10509 // If we have AVX, we can use VPERMILPS which will allow folding a load
10510 // into the shuffle.
10511 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
10512 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10513 }
10515 // Otherwise, use a straight shuffle of a single input vector. We pass the
10516 // input vector to both operands to simulate this with a SHUFPS.
10517 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
10518 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10519 }
10521 // There are special ways we can lower some single-element blends. However, we
10522 // have custom ways we can lower more complex single-element blends below that
10523 // we defer to if both this and BLENDPS fail to match, so restrict this to
10524 // when the V2 input is targeting element 0 of the mask -- that is the fast
10526 if (NumV2Elements == 1 && Mask[0] >= 4)
10527 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10528 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10529 return V;
10531 if (Subtarget.hasSSE41()) {
10532 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
10533 Zeroable, Subtarget, DAG))
10534 return Blend;
10536 // Use INSERTPS if we can complete the shuffle efficiently.
10537 if (SDValue V =
10538 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
10539 return V;
10541 if (!isSingleSHUFPSMask(Mask))
10542 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10543 DL, MVT::v4f32, V1, V2, Mask, DAG))
10544 return BlendPerm;
10545 }
10547 // Use low/high mov instructions.
10548 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10549 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
10550 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10551 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
10553 // Use dedicated unpack instructions for masks that match their pattern.
10554 if (SDValue V =
10555 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
10556 return V;
10558 // Otherwise fall back to a SHUFPS lowering strategy.
10559 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
10560 }
10562 /// \brief Lower 4-lane i32 vector shuffles.
10564 /// We try to handle these with integer-domain shuffles where we can, but for
10565 /// blends we use the floating point domain blend instructions.
10566 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10567 const APInt &Zeroable,
10568 SDValue V1, SDValue V2,
10569 const X86Subtarget &Subtarget,
10570 SelectionDAG &DAG) {
10571 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10572 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10573 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10575 // Whenever we can lower this as a zext, that instruction is strictly faster
10576 // than any alternative. It also allows us to fold memory operands into the
10577 // shuffle in many cases.
10578 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10579 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10580 return ZExt;
10582 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10584 if (NumV2Elements == 0) {
10585 // Check for being able to broadcast a single element.
10586 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10587 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10588 return Broadcast;
10590 // Straight shuffle of a single input vector. For everything from SSE2
10591 // onward this has a single fast instruction with no scary immediates.
10592 // We coerce the shuffle pattern to be compatible with UNPCK instructions
10593 // but we aren't actually going to use the UNPCK instruction because doing
10594 // so prevents folding a load into this instruction or making a copy.
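// Illustrative note (editor's addition): the mask {0, 0, 1, 1} below encodes
// as the PSHUFD immediate 0x50 (0b01010000).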
10595 const int UnpackLoMask[] = {0, 0, 1, 1};
10596 const int UnpackHiMask[] = {2, 2, 3, 3};
10597 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10598 Mask = UnpackLoMask;
10599 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10600 Mask = UnpackHiMask;
10602 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10603 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10604 }
10606 // Try to use shift instructions.
10607 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
10608 Zeroable, Subtarget, DAG))
10609 return Shift;
10611 // There are special ways we can lower some single-element blends.
10612 if (NumV2Elements == 1)
10613 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10614 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10615 return V;
10617 // We have different paths for blend lowering, but they all must use the
10618 // *exact* same predicate.
10619 bool IsBlendSupported = Subtarget.hasSSE41();
10620 if (IsBlendSupported)
10621 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
10622 Zeroable, Subtarget, DAG))
10623 return Blend;
10625 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
10626 Zeroable, DAG))
10627 return Masked;
10629 // Use dedicated unpack instructions for masks that match their pattern.
10630 if (SDValue V =
10631 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
10632 return V;
10634 // Try to use byte rotation instructions.
10635 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10636 if (Subtarget.hasSSSE3())
10637 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10638 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10639 return Rotate;
10641 // Assume that a single SHUFPS is faster than an alternative sequence of
10642 // multiple instructions (even if the CPU has a domain penalty).
10643 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10644 if (!isSingleSHUFPSMask(Mask)) {
10645 // If we have direct support for blends, we should lower by decomposing into
10646 // a permute. That will be faster than the domain cross.
10647 if (IsBlendSupported)
10648 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
10649 Mask, DAG);
10651 // Try to lower by permuting the inputs into an unpack instruction.
10652 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10653 DL, MVT::v4i32, V1, V2, Mask, DAG))
10654 return Unpack;
10655 }
10657 // We implement this with SHUFPS because it can blend from two vectors.
10658 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
10659 // up the inputs, bypassing domain shift penalties that we would incur if we
10660 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
10662 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10663 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10664 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
10665 return DAG.getBitcast(MVT::v4i32, ShufPS);
10666 }
10668 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10669 /// shuffle lowering, and the most complex part.
10671 /// The lowering strategy is to try to form pairs of input lanes which are
10672 /// targeted at the same half of the final vector, and then use a dword shuffle
10673 /// to place them onto the right half, and finally unpack the paired lanes into
10674 /// their final position.
10676 /// The exact breakdown of how to form these dword pairs and align them on the
10677 /// correct sides is really tricky. See the comments within the function for
10678 /// more of the details.
10680 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10681 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10682 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10683 /// vector, form the analogous 128-bit 8-element Mask.
10684 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10685 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10686 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10687 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10688 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
10690 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
10691 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10692 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
10694 SmallVector<int, 4> LoInputs;
10695 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
10696 std::sort(LoInputs.begin(), LoInputs.end());
10697 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10698 SmallVector<int, 4> HiInputs;
10699 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
10700 std::sort(HiInputs.begin(), HiInputs.end());
10701 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
10702 int NumLToL =
10703 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
10704 int NumHToL = LoInputs.size() - NumLToL;
10705 int NumLToH =
10706 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
10707 int NumHToH = HiInputs.size() - NumLToH;
10708 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10709 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10710 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10711 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
10713 // If we are splatting two values from one half - one to each half, then
10714 // we can shuffle that half so each is splatted to a dword, then splat those
10715 // to their respective halves.
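// Illustrative example (editor's addition): the v8i16 mask
// <0, 0, 0, 0, 2, 2, 2, 2> takes this path: PSHUFLW with {0, 0, 2, 2} packs
// words 0 and 2 into the low two dwords, and PSHUFD with {0, 0, 1, 1} then
// splats each of those dwords across its half of the result.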
10716 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
10717 int DOffset) {
10718 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10719 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10720 V = DAG.getNode(ShufWOp, DL, VT, V,
10721 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10722 V = DAG.getBitcast(PSHUFDVT, V);
10723 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10724 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10725 return DAG.getBitcast(VT, V);
10726 };
10728 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10729 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10730 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10731 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
10733 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10734 // such inputs we can swap two of the dwords across the half mark and end up
10735 // with <=2 inputs to each half in each half. Once there, we can fall through
10736 // to the generic code below. For example:
10738 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10739 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10741 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10742 // and an existing 2-into-2 on the other half. In this case we may have to
10743 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10744 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10745 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10746 // because any other situation (including a 3-into-1 or 1-into-3 in the other
10747 // half than the one we target for fixing) will be fixed when we re-enter this
10748 // path. We will also combine away any sequence of PSHUFD instructions that
10749 // result into a single instruction. Here is an example of the tricky case:
10751 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10752 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10754 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10756 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10757 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10759 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10760 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10762 // The result is fine to be handled by the generic logic.
10763 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10764 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10765 int AOffset, int BOffset) {
10766 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10767 "Must call this with A having 3 or 1 inputs from the A half.");
10768 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10769 "Must call this with B having 1 or 3 inputs from the B half.");
10770 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10771 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10773 bool ThreeAInputs = AToAInputs.size() == 3;
10775 // Compute the index of dword with only one word among the three inputs in
10776 // a half by taking the sum of the half with three inputs and subtracting
10777 // the sum of the actual three inputs. The difference is the remaining
10779 int ADWord, BDWord;
10780 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
10781 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
10782 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
10783 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
10784 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
10785 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
10786 int TripleNonInputIdx =
10787 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
10788 TripleDWord = TripleNonInputIdx / 2;
10790 // We use xor with one to compute the adjacent DWord to whichever one the
10792 OneInputDWord = (OneInput / 2) ^ 1;
10794 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
10795 // and BToA inputs. If there is also such a problem with the BToB and AToB
10796 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
10797 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
10798 // is essential that we don't *create* a 3<-1 as then we might oscillate.
10799 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
10800 // Compute how many inputs will be flipped by swapping these DWords. We need
10802 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
10804 int NumFlippedAToBInputs =
10805 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
10806 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
10807 int NumFlippedBToBInputs =
10808 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
10809 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
10810 if ((NumFlippedAToBInputs == 1 &&
10811 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
10812 (NumFlippedBToBInputs == 1 &&
10813 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
10814 // We choose whether to fix the A half or B half based on whether that
10815 // half has zero flipped inputs. At zero, we may not be able to fix it
10816 // with that half. We also bias towards fixing the B half because that
10817 // will more commonly be the high half, and we have to bias one way.
10818 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
10819 ArrayRef<int> Inputs) {
10820 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
10821 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
10822 // Determine whether the free index is in the flipped dword or the
10823 // unflipped dword based on where the pinned index is. We use this bit
10824 // in an xor to conditionally select the adjacent dword.
10825 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
10826 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10827 if (IsFixIdxInput == IsFixFreeIdxInput)
10828 FixFreeIdx += 1;
10829 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10830 assert(IsFixIdxInput != IsFixFreeIdxInput &&
10831 "We need to be changing the number of flipped inputs!");
10832 int PSHUFHalfMask[] = {0, 1, 2, 3};
10833 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
10834 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
10836 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10838 for (int &M : Mask)
10839 if (M >= 0 && M == FixIdx)
10840 M = FixFreeIdx;
10841 else if (M >= 0 && M == FixFreeIdx)
10842 M = FixIdx;
10843 };
10844 if (NumFlippedBToBInputs != 0) {
10845 int BPinnedIdx =
10846 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
10847 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
10848 } else {
10849 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
10850 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
10851 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
10852 }
10853 }
10854 }
10856 int PSHUFDMask[] = {0, 1, 2, 3};
10857 PSHUFDMask[ADWord] = BDWord;
10858 PSHUFDMask[BDWord] = ADWord;
10859 V = DAG.getBitcast(
10860 VT,
10861 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10862 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10864 // Adjust the mask to match the new locations of A and B.
10865 for (int &M : Mask)
10866 if (M >= 0 && M/2 == ADWord)
10867 M = 2 * BDWord + M % 2;
10868 else if (M >= 0 && M/2 == BDWord)
10869 M = 2 * ADWord + M % 2;
10871 // Recurse back into this routine to re-compute state now that this isn't
10872 // a 3 and 1 problem.
10873 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
10874 DAG);
10875 };
10876 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
10877 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
10878 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
10879 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
10881 // At this point there are at most two inputs to the low and high halves from
10882 // each half. That means the inputs can always be grouped into dwords and
10883 // those dwords can then be moved to the correct half with a dword shuffle.
10884 // We use at most one low and one high word shuffle to collect these paired
10885 // inputs into dwords, and finally a dword shuffle to place them.
10886 int PSHUFLMask[4] = {-1, -1, -1, -1};
10887 int PSHUFHMask[4] = {-1, -1, -1, -1};
10888 int PSHUFDMask[4] = {-1, -1, -1, -1};
10890 // First fix the masks for all the inputs that are staying in their
10891 // original halves. This will then dictate the targets of the cross-half
10893 auto fixInPlaceInputs =
10894 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
10895 MutableArrayRef<int> SourceHalfMask,
10896 MutableArrayRef<int> HalfMask, int HalfOffset) {
10897 if (InPlaceInputs.empty())
10898 return;
10899 if (InPlaceInputs.size() == 1) {
10900 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10901 InPlaceInputs[0] - HalfOffset;
10902 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
10903 return;
10904 }
10905 if (IncomingInputs.empty()) {
10906 // Just fix all of the in place inputs.
10907 for (int Input : InPlaceInputs) {
10908 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
10909 PSHUFDMask[Input / 2] = Input / 2;
10910 }
10911 return;
10912 }
10914 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
10915 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10916 InPlaceInputs[0] - HalfOffset;
10917 // Put the second input next to the first so that they are packed into
10918 // a dword. We find the adjacent index by toggling the low bit.
10919 int AdjIndex = InPlaceInputs[0] ^ 1;
10920 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
10921 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
10922 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
10923 };
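// Editorial worked example (hypothetical values): for InPlaceInputs = {0, 3}
// with HalfOffset == 0 and a non-empty IncomingInputs, the lambda above keeps
// word 0 in place (SourceHalfMask[0] = 0), packs word 3 next to it via
// AdjIndex = 0 ^ 1 == 1 (SourceHalfMask[1] = 3), rewrites mask references to
// 3 as 1, and records that dword 0 stays put (PSHUFDMask[0] = 0).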
10924 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
10925 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
10927 // Now gather the cross-half inputs and place them into a free dword of
10928 // their target half.
10929 // FIXME: This operation could almost certainly be simplified dramatically to
10930 // look more like the 3-1 fixing operation.
10931 auto moveInputsToRightHalf = [&PSHUFDMask](
10932 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
10933 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
10934 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
10935 int DestOffset) {
10936 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
10937 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
10938 };
10939 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
10940 int Word) {
10941 int LowWord = Word & ~1;
10942 int HighWord = Word | 1;
10943 return isWordClobbered(SourceHalfMask, LowWord) ||
10944 isWordClobbered(SourceHalfMask, HighWord);
10945 };
10947 if (IncomingInputs.empty())
10948 return;
10950 if (ExistingInputs.empty()) {
10951 // Map any dwords with inputs from them into the right half.
10952 for (int Input : IncomingInputs) {
10953 // If the source half mask maps over the inputs, turn those into
10954 // swaps and use the swapped lane.
10955 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
10956 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
10957 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
10958 Input - SourceOffset;
10959 // We have to swap the uses in our half mask in one sweep.
10960 for (int &M : HalfMask)
10961 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
10962 M = Input;
10963 else if (M == Input)
10964 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
10965 } else {
10966 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
10967 Input - SourceOffset &&
10968 "Previous placement doesn't match!");
10970 // Note that this correctly re-maps both when we do a swap and when
10971 // we observe the other side of the swap above. We rely on that to
10972 // avoid swapping the members of the input list directly.
10973 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
10976 // Map the input's dword into the correct half.
10977 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
10978 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
10979 else
10980 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
10981 Input / 2 &&
10982 "Previous placement doesn't match!");
10983 }
10985 // And just directly shift any other-half mask elements to be same-half
10986 // as we will have mirrored the dword containing the element into the
10987 // same position within that half.
10988 for (int &M : HalfMask)
10989 if (M >= SourceOffset && M < SourceOffset + 4) {
10990 M = M - SourceOffset + DestOffset;
10991 assert(M >= 0 && "This should never wrap below zero!");
10996 // Ensure we have the input in a viable dword of its current half. This
10997 // is particularly tricky because the original position may be clobbered
10998 // by inputs being moved and *staying* in that half.
10999 if (IncomingInputs.size() == 1) {
11000 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11001 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
11002 SourceOffset;
11003 SourceHalfMask[InputFixed - SourceOffset] =
11004 IncomingInputs[0] - SourceOffset;
11005 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
11006 InputFixed);
11007 IncomingInputs[0] = InputFixed;
11008 }
11009 } else if (IncomingInputs.size() == 2) {
11010 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
11011 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11012 // We have two non-adjacent or clobbered inputs we need to extract from
11013 // the source half. To do this, we need to map them into some adjacent
11014 // dword slot in the source mask.
11015 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
11016 IncomingInputs[1] - SourceOffset};
11018 // If there is a free slot in the source half mask adjacent to one of
11019 // the inputs, place the other input in it. We use (Index XOR 1) to
11020 // compute an adjacent index.
11021 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
11022 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
11023 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
11024 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11025 InputsFixed[1] = InputsFixed[0] ^ 1;
11026 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
11027 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
11028 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
11029 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
11030 InputsFixed[0] = InputsFixed[1] ^ 1;
11031 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
11032 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
11033 // The two inputs are in the same DWord but it is clobbered and the
11034 // adjacent DWord isn't used at all. Move both inputs to the free slot.
11036 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
11037 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
11038 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
11039 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
11040 } else {
11041 // The only way we hit this point is if there is no clobbering
11042 // (because there are no off-half inputs to this half) and there is no
11043 // free slot adjacent to one of the inputs. In this case, we have to
11044 // swap an input with a non-input.
11045 for (int i = 0; i < 4; ++i)
11046 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11047 "We can't handle any clobbers here!");
11048 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11049 "Cannot have adjacent inputs here!");
11051 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11052 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
11054 // We also have to update the final source mask in this case because
11055 // it may need to undo the above swap.
11056 for (int &M : FinalSourceHalfMask)
11057 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
11058 M = InputsFixed[1] + SourceOffset;
11059 else if (M == InputsFixed[1] + SourceOffset)
11060 M = (InputsFixed[0] ^ 1) + SourceOffset;
11062 InputsFixed[1] = InputsFixed[0] ^ 1;
11065 // Point everything at the fixed inputs.
11066 for (int &M : HalfMask)
11067 if (M == IncomingInputs[0])
11068 M = InputsFixed[0] + SourceOffset;
11069 else if (M == IncomingInputs[1])
11070 M = InputsFixed[1] + SourceOffset;
11072 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
11073 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
11074 }
11075 } else {
11076 llvm_unreachable("Unhandled input size!");
11077 }
11079 // Now hoist the DWord down to the right half.
11080 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
11081 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11082 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
11083 for (int &M : HalfMask)
11084 for (int Input : IncomingInputs)
11085 if (M == Input)
11086 M = FreeDWord * 2 + Input % 2;
11087 };
11088 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
11089 /*SourceOffset*/ 4, /*DestOffset*/ 0);
11090 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
11091 /*SourceOffset*/ 0, /*DestOffset*/ 4);
11093 // Now enact all the shuffles we've computed to move the inputs into their target halves.
11095 if (!isNoopShuffleMask(PSHUFLMask))
11096 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11097 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
11098 if (!isNoopShuffleMask(PSHUFHMask))
11099 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11100 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
11101 if (!isNoopShuffleMask(PSHUFDMask))
11102 V = DAG.getBitcast(
11103 VT,
11104 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11105 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11107 // At this point, each half should contain all its inputs, and we can then
11108 // just shuffle them into their final position.
11109 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11110 "Failed to lift all the high half inputs to the low mask!");
11111 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11112 "Failed to lift all the low half inputs to the high mask!");
11114 // Do a half shuffle for the low mask.
11115 if (!isNoopShuffleMask(LoMask))
11116 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11117 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11119 // Do a half shuffle with the high mask after shifting its values down.
11120 for (int &M : HiMask)
11121 if (M >= 0)
11122 M -= 4;
11123 if (!isNoopShuffleMask(HiMask))
11124 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11125 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11127 return V;
11128 }
11130 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
11131 /// blend if only one input is used.
11132 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
11133 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11134 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
11135 bool &V2InUse) {
11136 SDValue V1Mask[16];
11137 SDValue V2Mask[16];
11138 V1InUse = false;
11139 V2InUse = false;
11141 int Size = Mask.size();
11142 int Scale = 16 / Size;
11143 for (int i = 0; i < 16; ++i) {
11144 if (Mask[i / Scale] < 0) {
11145 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
11146 } else {
11147 const int ZeroMask = 0x80;
11148 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
11149 : ZeroMask;
11150 int V2Idx = Mask[i / Scale] < Size
11151 ? ZeroMask
11152 : (Mask[i / Scale] - Size) * Scale + i % Scale;
11153 if (Zeroable[i / Scale])
11154 V1Idx = V2Idx = ZeroMask;
11155 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
11156 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
11157 V1InUse |= (ZeroMask != V1Idx);
11158 V2InUse |= (ZeroMask != V2Idx);
11159 }
11160 }
11162 if (V1InUse)
11163 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11164 DAG.getBitcast(MVT::v16i8, V1),
11165 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
11166 if (V2InUse)
11167 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11168 DAG.getBitcast(MVT::v16i8, V2),
11169 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
11171 // If we need shuffled inputs from both, blend the two.
11172 SDValue V;
11173 if (V1InUse && V2InUse)
11174 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
11175 else
11176 V = V1InUse ? V1 : V2;
11178 // Cast the result back to the correct type.
11179 return DAG.getBitcast(VT, V);
11180 }
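// Editorial worked example (not part of the original source): for a v8i16
// shuffle with Mask = {0, 8, 1, 9, 2, 10, 3, 11} and nothing zeroable,
// Size == 8 and Scale == 2, so the byte-level PSHUFB controls become
//   V1Mask = {0, 1, 0x80, 0x80, 2, 3, 0x80, 0x80, 4, 5, 0x80, 0x80, 6, 7, 0x80, 0x80}
//   V2Mask = {0x80, 0x80, 0, 1, 0x80, 0x80, 2, 3, 0x80, 0x80, 4, 5, 0x80, 0x80, 6, 7}
// Both inputs are used, so the two PSHUFB results are ORed; the 0x80 entries
// zero the complementary bytes, which is what makes the OR act as a blend.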
11182 /// \brief Generic lowering of 8-lane i16 shuffles.
11184 /// This handles both single-input shuffles and combined shuffle/blends with
11185 /// two inputs. The single input shuffles are immediately delegated to
11186 /// a dedicated lowering routine.
11188 /// The blends are lowered in one of three fundamental ways. If there are few
11189 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
11190 /// of the input is significantly cheaper when lowered as an interleaving of
11191 /// the two inputs, try to interleave them. Otherwise, blend the low and high
11192 /// halves of the inputs separately (making them have relatively few inputs)
11193 /// and then concatenate them.
11194 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11195 const APInt &Zeroable,
11196 SDValue V1, SDValue V2,
11197 const X86Subtarget &Subtarget,
11198 SelectionDAG &DAG) {
11199 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11200 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11201 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11203 // Whenever we can lower this as a zext, that instruction is strictly faster
11204 // than any alternative.
11205 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11206 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11209 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
11211 if (NumV2Inputs == 0) {
11212 // Check for being able to broadcast a single element.
11213 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11214 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11217 // Try to use shift instructions.
11218 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
11219 Zeroable, Subtarget, DAG))
11222 // Use dedicated unpack instructions for masks that match their pattern.
11223 if (SDValue V =
11224 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11225 return V;
11227 // Try to use byte rotation instructions.
11228 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
11229 Mask, Subtarget, DAG))
11232 // Make a copy of the mask so it can be modified.
11233 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
11234 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
11235 MutableMask, Subtarget,
11236 DAG);
11237 }
11239 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
11240 "All single-input shuffles should be canonicalized to be V1-input "
11241 "shuffles.");
11243 // Try to use shift instructions.
11244 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
11245 Zeroable, Subtarget, DAG))
11248 // See if we can use SSE4A Extraction / Insertion.
11249 if (Subtarget.hasSSE4A())
11250 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
11251 Zeroable, DAG))
11252 return V;
11254 // There are special ways we can lower some single-element blends.
11255 if (NumV2Inputs == 1)
11256 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11257 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11260 // We have different paths for blend lowering, but they all must use the
11261 // *exact* same predicate.
11262 bool IsBlendSupported = Subtarget.hasSSE41();
11263 if (IsBlendSupported)
11264 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
11265 Zeroable, Subtarget, DAG))
11268 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
11269 Zeroable, DAG))
11270 return Masked;
11272 // Use dedicated unpack instructions for masks that match their pattern.
11273 if (SDValue V =
11274 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11275 return V;
11277 // Try to use byte rotation instructions.
11278 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11279 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11282 if (SDValue BitBlend =
11283 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
11286 // Try to lower by permuting the inputs into an unpack instruction.
11287 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
11288 V2, Mask, DAG))
11289 return Unpack;
11291 // If we can't directly blend but can use PSHUFB, that will be better as it
11292 // can both shuffle and set up the inefficient blend.
11293 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
11294 bool V1InUse, V2InUse;
11295 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
11296 Zeroable, DAG, V1InUse, V2InUse);
11297 }
11299 // We can always bit-blend if we have to so the fallback strategy is to
11300 // decompose into single-input permutes and blends.
11301 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
11302 Mask, DAG);
11303 }
11305 /// \brief Check whether a compaction lowering can be done by dropping even
11306 /// elements and compute how many times even elements must be dropped.
11308 /// This handles shuffles which take every Nth element where N is a power of
11309 /// two. Example shuffle masks:
11311 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11312 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11313 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11314 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11315 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11316 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11318 /// Any of these lanes can of course be undef.
11320 /// This routine only supports N <= 3.
11321 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11324 /// \returns N above, or the number of times even elements must be dropped if
11325 /// there is such a number. Otherwise returns zero.
11326 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11327 bool IsSingleInput) {
11328 // The modulus for the shuffle vector entries is based on whether this is
11329 // a single input or not.
11330 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11331 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11332 "We should only be called with masks with a power-of-2 size!");
11334 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11336 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11337 // and 2^3 simultaneously. This is because we may have ambiguity with
11338 // partially undef inputs.
11339 bool ViableForN[3] = {true, true, true};
11341 for (int i = 0, e = Mask.size(); i < e; ++i) {
11342 // Ignore undef lanes, we'll optimistically collapse them to the pattern we want.
11344 if (Mask[i] < 0)
11345 continue;
11347 bool IsAnyViable = false;
11348 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11349 if (ViableForN[j]) {
11350 uint64_t N = j + 1;
11352 // The shuffle mask must be equal to (i * 2^N) % M.
11353 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11354 IsAnyViable = true;
11355 else
11356 ViableForN[j] = false;
11358 // Early exit if we exhaust the possible powers of two.
11359 if (!IsAnyViable)
11360 break;
11361 }
11363 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11364 if (ViableForN[j])
11365 return j + 1;
11367 // Return 0 as there is no viable power of two.
11368 return 0;
11369 }
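// Editorial worked example (illustrative): with IsSingleInput == true and
// Mask = {0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12} the modulus is
// 16, and every defined element satisfies Mask[i] == ((i << 2) & 15), so only
// N == 2 remains viable and the routine returns 2: dropping the even elements
// twice recovers the requested every-fourth-element pattern.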
11371 /// \brief Generic lowering of v16i8 shuffles.
11373 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
11374 /// detect any complexity reducing interleaving. If that doesn't help, it uses
11375 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
11376 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
11378 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11379 const APInt &Zeroable,
11380 SDValue V1, SDValue V2,
11381 const X86Subtarget &Subtarget,
11382 SelectionDAG &DAG) {
11383 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11384 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11385 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11387 // Try to use shift instructions.
11388 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
11389 Zeroable, Subtarget, DAG))
11392 // Try to use byte rotation instructions.
11393 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11394 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11397 // Try to use a zext lowering.
11398 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11399 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11402 // See if we can use SSE4A Extraction / Insertion.
11403 if (Subtarget.hasSSE4A())
11404 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
11408 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
11410 // For single-input shuffles, there are some nicer lowering tricks we can use.
11411 if (NumV2Elements == 0) {
11412 // Check for being able to broadcast a single element.
11413 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11414 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11417 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
11418 // Notably, this handles splat and partial-splat shuffles more efficiently.
11419 // However, it only makes sense if the pre-duplication shuffle simplifies
11420 // things significantly. Currently, this means we need to be able to
11421 // express the pre-duplication shuffle as an i16 shuffle.
11423 // FIXME: We should check for other patterns which can be widened into an
11424 // i16 shuffle as well.
11425 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
11426 for (int i = 0; i < 16; i += 2)
11427 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
11428 return false;
11430 return true;
11431 };
11432 auto tryToWidenViaDuplication = [&]() -> SDValue {
11433 if (!canWidenViaDuplication(Mask))
11435 SmallVector<int, 4> LoInputs;
11436 copy_if(Mask, std::back_inserter(LoInputs),
11437 [](int M) { return M >= 0 && M < 8; });
11438 std::sort(LoInputs.begin(), LoInputs.end());
11439 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
11441 SmallVector<int, 4> HiInputs;
11442 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
11443 std::sort(HiInputs.begin(), HiInputs.end());
11444 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
11447 bool TargetLo = LoInputs.size() >= HiInputs.size();
11448 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
11449 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
11451 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11452 SmallDenseMap<int, int, 8> LaneMap;
11453 for (int I : InPlaceInputs) {
11454 PreDupI16Shuffle[I/2] = I/2;
11455 LaneMap[I] = I;
11456 }
11457 int j = TargetLo ? 0 : 4, je = j + 4;
11458 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
11459 // Check if j is already a shuffle of this input. This happens when
11460 // there are two adjacent bytes after we move the low one.
11461 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
11462 // If we haven't yet mapped the input, search for a slot into which we can map it.
11464 while (j < je && PreDupI16Shuffle[j] >= 0)
11465 ++j;
11467 if (j == je)
11468 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
11469 return SDValue();
11471 // Map this input with the i16 shuffle.
11472 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
11475 // Update the lane map based on the mapping we ended up with.
11476 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
11478 V1 = DAG.getBitcast(
11479 MVT::v16i8,
11480 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11481 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
11483 // Unpack the bytes to form the i16s that will be shuffled into place.
11484 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11485 MVT::v16i8, V1, V1);
11487 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
11488 for (int i = 0; i < 16; ++i)
11489 if (Mask[i] >= 0) {
11490 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
11491 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
11492 if (PostDupI16Shuffle[i / 2] < 0)
11493 PostDupI16Shuffle[i / 2] = MappedMask;
11495 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
11496 "Conflicting entries in the original shuffle!");
11498 return DAG.getBitcast(
11499 MVT::v16i8,
11500 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11501 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
11502 };
11503 if (SDValue V = tryToWidenViaDuplication())
11504 return V;
11505 }
11507 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
11508 Zeroable, DAG))
11509 return Masked;
11511 // Use dedicated unpack instructions for masks that match their pattern.
11512 if (SDValue V =
11513 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
11514 return V;
11516 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
11517 // with PSHUFB. It is important to do this before we attempt to generate any
11518 // blends but after all of the single-input lowerings. If the single input
11519 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
11520 // want to preserve that and we can DAG combine any longer sequences into
11521 // a PSHUFB in the end. But once we start blending from multiple inputs,
11522 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
11523 // and there are *very* few patterns that would actually be faster than the
11524 // PSHUFB approach because of its ability to zero lanes.
11526 // FIXME: The only exceptions to the above are blends which are exact
11527 // interleavings with direct instructions supporting them. We currently don't
11528 // handle those well here.
11529 if (Subtarget.hasSSSE3()) {
11530 bool V1InUse = false;
11531 bool V2InUse = false;
11533 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
11534 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
11536 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
11537 // do so. This avoids using them to handle blends-with-zero which is
11538 // important as a single pshufb is significantly faster for that.
11539 if (V1InUse && V2InUse) {
11540 if (Subtarget.hasSSE41())
11541 if (SDValue Blend = lowerVectorShuffleAsBlend(
11542 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11545 // We can use an unpack to do the blending rather than an or in some
11546 // cases. Even though the or may be (very minorly) more efficient, we
11547 // preference this lowering because there are common cases where part of
11548 // the complexity of the shuffles goes away when we do the final blend as an unpack.
11550 // FIXME: It might be worth trying to detect if the unpack-feeding
11551 // shuffles will both be pshufb, in which case we shouldn't bother with this.
11553 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11554 DL, MVT::v16i8, V1, V2, Mask, DAG))
11555 return Unpack;
11556 }
11558 return PSHUFB;
11559 }
11561 // There are special ways we can lower some single-element blends.
11562 if (NumV2Elements == 1)
11563 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11564 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11567 if (SDValue BitBlend =
11568 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
11571 // Check whether a compaction lowering can be done. This handles shuffles
11572 // which take every Nth element for some even N. See the helper function for details.
11575 // We special case these as they can be particularly efficiently handled with
11576 // the PACKUSWB instruction on x86 and they show up in common patterns of
11577 // rearranging bytes to truncate wide elements.
11578 bool IsSingleInput = V2.isUndef();
11579 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
11580 // NumEvenDrops is the power of two stride of the elements. Another way of
11581 // thinking about it is that we need to drop the even elements this many
11582 // times to get the original input.
11584 // First we need to zero all the dropped bytes.
11585 assert(NumEvenDrops <= 3 &&
11586 "No support for dropping even elements more than 3 times.");
11587 // We use the mask type to pick which bytes are preserved based on how many
11588 // elements are dropped.
11589 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
11590 SDValue ByteClearMask = DAG.getBitcast(
11591 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
11592 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
11593 if (!IsSingleInput)
11594 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
11596 // Now pack things back together.
11597 V1 = DAG.getBitcast(MVT::v8i16, V1);
11598 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
11599 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
11600 for (int i = 1; i < NumEvenDrops; ++i) {
11601 Result = DAG.getBitcast(MVT::v8i16, Result);
11602 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
11603 }
11605 return Result;
11606 }
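// Editorial worked example (illustrative): for a two-input v16i8 mask
// {0, 2, 4, ..., 30}, canLowerByDroppingEvenElements returns 1. The v8i16
// 0x00FF splat clears every odd byte of V1 and V2, and a single
// X86ISD::PACKUS then yields the even bytes of V1 in the low half and the
// even bytes of V2 in the high half, exactly the requested shuffle.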
11608 // Handle multi-input cases by blending single-input shuffles.
11609 if (NumV2Elements > 0)
11610 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
11611 Mask, DAG);
11613 // The fallback path for single-input shuffles widens this into two v8i16
11614 // vectors with unpacks, shuffles those, and then pulls them back together
11615 // with a pack.
11617 SDValue V = V1;
11618 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11619 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11620 for (int i = 0; i < 16; ++i)
11621 if (Mask[i] >= 0)
11622 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
11624 SDValue VLoHalf, VHiHalf;
11625 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
11626 // them out and avoid using UNPCK{L,H} to extract the elements of V as i16s.
11628 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
11629 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
11630 // Use a mask to drop the high bytes.
11631 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
11632 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
11633 DAG.getConstant(0x00FF, DL, MVT::v8i16));
11635 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
11636 VHiHalf = DAG.getUNDEF(MVT::v8i16);
11638 // Squash the masks to point directly into VLoHalf.
11639 for (int &M : LoBlendMask)
11640 if (M >= 0)
11641 M /= 2;
11642 for (int &M : HiBlendMask)
11643 if (M >= 0)
11644 M /= 2;
11645 } else {
11646 // Otherwise just unpack the low half of V into VLoHalf and the high half into
11647 // VHiHalf so that we can blend them as i16s.
11648 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
11650 VLoHalf = DAG.getBitcast(
11651 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
11652 VHiHalf = DAG.getBitcast(
11653 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
11654 }
11656 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
11657 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
11659 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
11660 }
11662 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
11664 /// This routine breaks down the specific type of 128-bit shuffle and
11665 /// dispatches to the lowering routines accordingly.
11666 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11667 MVT VT, SDValue V1, SDValue V2,
11668 const APInt &Zeroable,
11669 const X86Subtarget &Subtarget,
11670 SelectionDAG &DAG) {
11671 switch (VT.SimpleTy) {
11672 case MVT::v2i64:
11673 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11674 case MVT::v2f64:
11675 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11676 case MVT::v4i32:
11677 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11678 case MVT::v4f32:
11679 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11680 case MVT::v8i16:
11681 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11682 case MVT::v16i8:
11683 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11685 default:
11686 llvm_unreachable("Unimplemented!");
11687 }
11688 }
11690 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
11692 /// This routine just extracts two subvectors, shuffles them independently, and
11693 /// then concatenates them back together. This should work effectively with all
11694 /// AVX vector shuffle types.
11695 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11696 SDValue V2, ArrayRef<int> Mask,
11697 SelectionDAG &DAG) {
11698 assert(VT.getSizeInBits() >= 256 &&
11699 "Only for 256-bit or wider vector shuffles!");
11700 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
11701 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
11703 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
11704 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
11706 int NumElements = VT.getVectorNumElements();
11707 int SplitNumElements = NumElements / 2;
11708 MVT ScalarVT = VT.getVectorElementType();
11709 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
11711 // Rather than splitting build-vectors, just build two narrower build
11712 // vectors. This helps shuffling with splats and zeros.
11713 auto SplitVector = [&](SDValue V) {
11714 V = peekThroughBitcasts(V);
11716 MVT OrigVT = V.getSimpleValueType();
11717 int OrigNumElements = OrigVT.getVectorNumElements();
11718 int OrigSplitNumElements = OrigNumElements / 2;
11719 MVT OrigScalarVT = OrigVT.getVectorElementType();
11720 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
11722 SDValue LoV, HiV;
11724 auto *BV = dyn_cast<BuildVectorSDNode>(V);
11725 if (!BV) {
11726 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11727 DAG.getIntPtrConstant(0, DL));
11728 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11729 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
11730 } else {
11732 SmallVector<SDValue, 16> LoOps, HiOps;
11733 for (int i = 0; i < OrigSplitNumElements; ++i) {
11734 LoOps.push_back(BV->getOperand(i));
11735 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
11736 }
11737 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
11738 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
11739 }
11740 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
11741 DAG.getBitcast(SplitVT, HiV));
11742 };
11744 SDValue LoV1, HiV1, LoV2, HiV2;
11745 std::tie(LoV1, HiV1) = SplitVector(V1);
11746 std::tie(LoV2, HiV2) = SplitVector(V2);
11748 // Now create two 4-way blends of these half-width vectors.
11749 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
11750 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
11751 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
11752 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
11753 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
11754 for (int i = 0; i < SplitNumElements; ++i) {
11755 int M = HalfMask[i];
11756 if (M >= NumElements) {
11757 if (M >= NumElements + SplitNumElements)
11758 UseHiV2 = true;
11759 else
11760 UseLoV2 = true;
11761 V2BlendMask[i] = M - NumElements;
11762 BlendMask[i] = SplitNumElements + i;
11763 } else if (M >= 0) {
11764 if (M >= SplitNumElements)
11765 UseHiV1 = true;
11766 else
11767 UseLoV1 = true;
11768 V1BlendMask[i] = M;
11769 BlendMask[i] = i;
11770 }
11771 }
11773 // Because the lowering happens after all combining takes place, we need to
11774 // manually combine these blend masks as much as possible so that we create
11775 // a minimal number of high-level vector shuffle nodes.
11777 // First try just blending the halves of V1 or V2.
11778 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
11779 return DAG.getUNDEF(SplitVT);
11780 if (!UseLoV2 && !UseHiV2)
11781 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11782 if (!UseLoV1 && !UseHiV1)
11783 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11785 SDValue V1Blend, V2Blend;
11786 if (UseLoV1 && UseHiV1) {
11787 V1Blend =
11788 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11789 } else {
11790 // We only use half of V1 so map the usage down into the final blend mask.
11791 V1Blend = UseLoV1 ? LoV1 : HiV1;
11792 for (int i = 0; i < SplitNumElements; ++i)
11793 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
11794 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
11795 }
11796 if (UseLoV2 && UseHiV2) {
11797 V2Blend =
11798 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11799 } else {
11800 // We only use half of V2 so map the usage down into the final blend mask.
11801 V2Blend = UseLoV2 ? LoV2 : HiV2;
11802 for (int i = 0; i < SplitNumElements; ++i)
11803 if (BlendMask[i] >= SplitNumElements)
11804 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
11805 }
11806 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
11807 };
11808 SDValue Lo = HalfBlend(LoMask);
11809 SDValue Hi = HalfBlend(HiMask);
11810 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
11811 }
11813 /// \brief Either split a vector in halves or decompose the shuffles and the blend.
11815 ///
11816 /// This is provided as a good fallback for many lowerings of non-single-input
11817 /// shuffles with more than one 128-bit lane. In those cases, we want to select
11818 /// between splitting the shuffle into 128-bit components and stitching those
11819 /// back together vs. extracting the single-input shuffles and blending those results.
11821 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
11822 SDValue V1, SDValue V2,
11823 ArrayRef<int> Mask,
11824 SelectionDAG &DAG) {
11825 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
11826 "shuffles as it could then recurse on itself.");
11827 int Size = Mask.size();
11829 // If this can be modeled as a broadcast of two elements followed by a blend,
11830 // prefer that lowering. This is especially important because broadcasts can
11831 // often fold with memory operands.
11832 auto DoBothBroadcast = [&] {
11833 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
11834 for (int M : Mask)
11835 if (M >= Size) {
11836 if (V2BroadcastIdx < 0)
11837 V2BroadcastIdx = M - Size;
11838 else if (M - Size != V2BroadcastIdx)
11839 return false;
11840 } else if (M >= 0) {
11841 if (V1BroadcastIdx < 0)
11842 V1BroadcastIdx = M;
11843 else if (M != V1BroadcastIdx)
11844 return false;
11845 }
11846 return true;
11847 };
11848 if (DoBothBroadcast())
11849 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
11850 DAG);
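// Editorial example (illustrative): a v8i32 mask such as
// {3, 3, 3, 3, 12, 12, 12, 12} broadcasts element 3 of V1 and element 4 of
// V2, so DoBothBroadcast() returns true and the decomposed shuffle + blend
// above is chosen in preference to splitting.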
11852 // If the inputs all stem from a single 128-bit lane of each input, then we
11853 // split them rather than blending because the split will decompose to
11854 // unusually few instructions.
11855 int LaneCount = VT.getSizeInBits() / 128;
11856 int LaneSize = Size / LaneCount;
11857 SmallBitVector LaneInputs[2];
11858 LaneInputs[0].resize(LaneCount, false);
11859 LaneInputs[1].resize(LaneCount, false);
11860 for (int i = 0; i < Size; ++i)
11862 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
11863 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
11864 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11866 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
11867 // that the decomposed single-input shuffles don't end up here.
11868 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
11869 }
11871 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
11872 /// a permutation and blend of those lanes.
11874 /// This essentially blends the out-of-lane inputs to each lane into the lane
11875 /// from a permuted copy of the vector. This lowering strategy results in four
11876 /// instructions in the worst case for a single-input cross lane shuffle which
11877 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
11878 /// of. Special cases for each particular shuffle pattern should be handled
11879 /// prior to trying this lowering.
11880 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
11881 SDValue V1, SDValue V2,
11882 ArrayRef<int> Mask,
11883 SelectionDAG &DAG) {
11884 // FIXME: This should probably be generalized for 512-bit vectors as well.
11885 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
11886 int Size = Mask.size();
11887 int LaneSize = Size / 2;
11889 // If there are only inputs from one 128-bit lane, splitting will in fact be
11890 // less expensive. The flags track whether the given lane contains an element
11891 // that crosses to another lane.
11892 bool LaneCrossing[2] = {false, false};
11893 for (int i = 0; i < Size; ++i)
11894 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11895 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
11896 if (!LaneCrossing[0] || !LaneCrossing[1])
11897 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11899 assert(V2.isUndef() &&
11900 "This last part of this routine only works on single input shuffles");
11902 SmallVector<int, 32> FlippedBlendMask(Size);
11903 for (int i = 0; i < Size; ++i)
11904 FlippedBlendMask[i] =
11905 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
11906 ? Mask[i]
11907 : Mask[i] % LaneSize +
11908 (i / LaneSize) * LaneSize + Size);
11910 // Flip the vector, and blend the results which should now be in-lane. The
11911 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
11912 // 5 for the high source. The value 3 selects the high half of source 2 and
11913 // the value 2 selects the low half of source 2. We only use source 2 to
11914 // allow folding it into a memory operand.
11915 unsigned PERMMask = 3 | 2 << 4;
11916 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
11917 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
11918 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
11919 }
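// Editorial worked example (illustrative; such simple masks are normally
// caught by earlier special cases): for a single-input v4f64 shuffle with
// Mask = {2, 3, 0, 1}, both lanes cross, the VPERM2X128 with immediate
// 0x23 (3 | 2 << 4) builds Flipped = <V1 high lane, V1 low lane>, and
// FlippedBlendMask becomes {4, 5, 6, 7}, i.e. the final shuffle simply
// selects the lane-swapped copy.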
11921 /// \brief Handle lowering 2-lane 128-bit shuffles.
11922 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11923 SDValue V2, ArrayRef<int> Mask,
11924 const APInt &Zeroable,
11925 const X86Subtarget &Subtarget,
11926 SelectionDAG &DAG) {
11927 SmallVector<int, 4> WidenedMask;
11928 if (!canWidenShuffleElements(Mask, WidenedMask))
11931 // TODO: If minimizing size and one of the inputs is a zero vector and the
11932 // the zero vector has only one use, we could use a VPERM2X128 to save the
11933 // instruction bytes needed to explicitly generate the zero vector.
11935 // Blends are faster and handle all the non-lane-crossing cases.
11936 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
11937 Zeroable, Subtarget, DAG))
11940 bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
11941 bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
11943 // If either input operand is a zero vector, use VPERM2X128 because its mask
11944 // allows us to replace the zero input with an implicit zero.
11945 if (!IsV1Zero && !IsV2Zero) {
11946 // Check for patterns which can be matched with a single insert of a 128-bit subvector.
11948 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
11949 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
11950 // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
11951 if (Subtarget.hasAVX2() && V2.isUndef())
11954 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
11955 VT.getVectorNumElements() / 2);
11956 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
11957 DAG.getIntPtrConstant(0, DL));
11958 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
11959 OnlyUsesV1 ? V1 : V2,
11960 DAG.getIntPtrConstant(0, DL));
11961 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
11962 }
11963 }
11965 // Otherwise form a 128-bit permutation. After accounting for undefs,
11966 // convert the 64-bit shuffle mask selection values into 128-bit
11967 // selection bits by dividing the indexes by 2 and shifting into positions
11968 // defined by a vperm2*128 instruction's immediate control byte.
11970 // The immediate permute control byte looks like this:
11971 // [1:0] - select 128 bits from sources for low half of destination
11972 // [2] - ignore
11973 // [3] - zero low half of destination
11974 // [5:4] - select 128 bits from sources for high half of destination
11975 // [6] - ignore
11976 // [7] - zero high half of destination
11978 int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
11979 int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
11981 unsigned PermMask = MaskLO | (MaskHI << 4);
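// Editorial example (illustrative): if WidenedMask = {1, 3}, the low 128 bits
// of the result come from the high half of V1 and the high 128 bits from the
// high half of V2, so MaskLO = 1, MaskHI = 3 and
// PermMask = 1 | (3 << 4) = 0x31.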
11983 // If either input is a zero vector, replace it with an undef input.
11984 // Shuffle mask values < 4 are selecting elements of V1.
11985 // Shuffle mask values >= 4 are selecting elements of V2.
11986 // Adjust each half of the permute mask by clearing the half that was
11987 // selecting the zero vector and setting the zero mask bit.
11988 if (IsV1Zero) {
11989 V1 = DAG.getUNDEF(VT);
11990 if (MaskLO < 2)
11991 PermMask = (PermMask & 0xf0) | 0x08;
11992 if (MaskHI < 2)
11993 PermMask = (PermMask & 0x0f) | 0x80;
11994 }
11995 if (IsV2Zero) {
11996 V2 = DAG.getUNDEF(VT);
11997 if (MaskLO >= 2)
11998 PermMask = (PermMask & 0xf0) | 0x08;
11999 if (MaskHI >= 2)
12000 PermMask = (PermMask & 0x0f) | 0x80;
12001 }
12003 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
12004 DAG.getConstant(PermMask, DL, MVT::i8));
12005 }
12007 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
12008 /// shuffling each lane.
12010 /// This will only succeed when the result of fixing the 128-bit lanes results
12011 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
12012 /// each 128-bit lane. This handles many cases where we can quickly blend away
12013 /// the lane crosses early and then use simpler shuffles within each lane.
12015 /// FIXME: It might be worthwhile at some point to support this without
12016 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
12017 /// in x86 only floating point has interesting non-repeating shuffles, and even
12018 /// those are still *marginally* more expensive.
12019 static SDValue lowerVectorShuffleByMerging128BitLanes(
12020 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12021 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12022 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12024 int Size = Mask.size();
12025 int LaneSize = 128 / VT.getScalarSizeInBits();
12026 int NumLanes = Size / LaneSize;
12027 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12029 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
12030 // check whether the in-128-bit lane shuffles share a repeating pattern.
12031 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
12032 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
12033 for (int i = 0; i < Size; ++i) {
12037 int j = i / LaneSize;
12039 if (Lanes[j] < 0) {
12040 // First entry we've seen for this lane.
12041 Lanes[j] = Mask[i] / LaneSize;
12042 } else if (Lanes[j] != Mask[i] / LaneSize) {
12043 // This doesn't match the lane selected previously!
12047 // Check that within each lane we have a consistent shuffle mask.
12048 int k = i % LaneSize;
12049 if (InLaneMask[k] < 0) {
12050 InLaneMask[k] = Mask[i] % LaneSize;
12051 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
12052 // This doesn't fit a repeating in-lane mask.
12057 // First shuffle the lanes into place.
12058 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
12059 VT.getSizeInBits() / 64);
12060 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
12061 for (int i = 0; i < NumLanes; ++i)
12062 if (Lanes[i] >= 0) {
12063 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
12064 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
12067 V1 = DAG.getBitcast(LaneVT, V1);
12068 V2 = DAG.getBitcast(LaneVT, V2);
12069 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
12071 // Cast it back to the type we actually want.
12072 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
12074 // Now do a simple shuffle that isn't lane crossing.
12075 SmallVector<int, 8> NewMask((unsigned)Size, -1);
12076 for (int i = 0; i < Size; ++i)
12078 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
12079 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12080 "Must not introduce lane crosses at this point!");
12082 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
12083 }
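// Editorial worked example (hypothetical values): for a v8i32 shuffle of V1
// and V2 with Mask = {9, 8, 11, 10, 1, 0, 3, 2}, both destination lanes use
// the in-lane pattern {1, 0, 3, 2}; lane 0 sources V2's low lane and lane 1
// sources V1's low lane, so the v4i64 lane fix uses LaneMask = {4, 5, 0, 1}
// and the follow-up per-lane shuffle {1, 0, 3, 2, 5, 4, 7, 6} finishes the
// job without crossing lanes.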
12085 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
12086 /// This allows for fast cases such as subvector extraction/insertion
12087 /// or shuffling smaller vector types which can lower more efficiently.
12088 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12089 SDValue V1, SDValue V2,
12090 ArrayRef<int> Mask,
12091 const X86Subtarget &Subtarget,
12092 SelectionDAG &DAG) {
12093 assert(VT.is256BitVector() && "Expected 256-bit vector");
12095 unsigned NumElts = VT.getVectorNumElements();
12096 unsigned HalfNumElts = NumElts / 2;
12097 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12099 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12100 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12101 if (!UndefLower && !UndefUpper)
12104 // Upper half is undef and lower half is whole upper subvector.
12105 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12106 if (UndefUpper &&
12107 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12108 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12109 DAG.getIntPtrConstant(HalfNumElts, DL));
12110 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12111 DAG.getIntPtrConstant(0, DL));
12114 // Lower half is undef and upper half is whole lower subvector.
12115 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12116 if (UndefLower &&
12117 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12118 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12119 DAG.getIntPtrConstant(0, DL));
12120 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12121 DAG.getIntPtrConstant(HalfNumElts, DL));
12124 // If the shuffle only uses two of the four halves of the input operands,
12125 // then extract them and perform the 'half' shuffle at half width.
12126 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12127 int HalfIdx1 = -1, HalfIdx2 = -1;
12128 SmallVector<int, 8> HalfMask(HalfNumElts);
12129 unsigned Offset = UndefLower ? HalfNumElts : 0;
12130 for (unsigned i = 0; i != HalfNumElts; ++i) {
12131 int M = Mask[i + Offset];
12137 // Determine which of the 4 half vectors this element is from.
12138 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12139 int HalfIdx = M / HalfNumElts;
12141 // Determine the element index into its half vector source.
12142 int HalfElt = M % HalfNumElts;
12144 // We can shuffle with up to 2 half vectors, set the new 'half'
12145 // shuffle mask accordingly.
12146 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12147 HalfMask[i] = HalfElt;
12148 HalfIdx1 = HalfIdx;
12151 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12152 HalfMask[i] = HalfElt + HalfNumElts;
12153 HalfIdx2 = HalfIdx;
12157 // Too many half vectors referenced.
12160 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12162 // Only shuffle the halves of the inputs when useful.
12163 int NumLowerHalves =
12164 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12165 int NumUpperHalves =
12166 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12168 // uuuuXXXX - don't extract uppers just to insert again.
12169 if (UndefLower && NumUpperHalves != 0)
12172 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12173 if (UndefUpper && NumUpperHalves == 2)
12176 // AVX2 - XXXXuuuu - always extract lowers.
12177 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12178 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12179 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12181 // AVX2 supports variable 32-bit element cross-lane shuffles.
12182 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12183 // XXXXuuuu - don't extract lowers and uppers.
12184 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
12189 auto GetHalfVector = [&](int HalfIdx) {
12191 return DAG.getUNDEF(HalfVT);
12192 SDValue V = (HalfIdx < 2 ? V1 : V2);
12193 HalfIdx = (HalfIdx % 2) * HalfNumElts;
12194 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12195 DAG.getIntPtrConstant(HalfIdx, DL));
12198 SDValue Half1 = GetHalfVector(HalfIdx1);
12199 SDValue Half2 = GetHalfVector(HalfIdx2);
12200 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
12201 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12202 DAG.getIntPtrConstant(Offset, DL));
12205 /// \brief Test whether the specified input (0 or 1) is in-place blended by the given mask.
12207 ///
12208 /// This returns true if the elements from a particular input are already in the
12209 /// slot required by the given mask and require no permutation.
12210 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12211 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12212 int Size = Mask.size();
12213 for (int i = 0; i < Size; ++i)
12214 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12215 return false;
12217 return true;
12218 }
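// Editorial example (illustrative): for a v4 mask {0, 1, 6, 7}, input 0
// supplies elements 0 and 1 at positions 0 and 1 and input 1 supplies
// elements 6 % 4 == 2 and 7 % 4 == 3 at positions 2 and 3, so this predicate
// returns true for either input.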
12220 /// Handle case where shuffle sources are coming from the same 128-bit lane and
12221 /// every lane can be represented as the same repeating mask - allowing us to
12222 /// shuffle the sources with the repeating shuffle and then permute the result
12223 /// to the destination lanes.
12224 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12225 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12226 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12227 int NumElts = VT.getVectorNumElements();
12228 int NumLanes = VT.getSizeInBits() / 128;
12229 int NumLaneElts = NumElts / NumLanes;
12231 // On AVX2 we may be able to just shuffle the lowest elements and then
12232 // broadcast the result.
12233 if (Subtarget.hasAVX2()) {
12234 for (unsigned BroadcastSize : {16, 32, 64}) {
12235 if (BroadcastSize <= VT.getScalarSizeInBits())
12237 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12239 // Attempt to match a repeating pattern every NumBroadcastElts,
12240 // accounting for UNDEFs but only references the lowest 128-bit
12241 // lane of the inputs.
12242 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12243 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12244 for (int j = 0; j != NumBroadcastElts; ++j) {
12245 int M = Mask[i + j];
12248 int &R = RepeatMask[j];
12249 if (0 != ((M % NumElts) / NumLaneElts))
12251 if (0 <= R && R != M)
12258 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12259 if (!FindRepeatingBroadcastMask(RepeatMask))
12262 // Shuffle the (lowest) repeated elements in place for broadcast.
12263 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12265 // Shuffle the actual broadcast.
12266 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12267 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12268 for (int j = 0; j != NumBroadcastElts; ++j)
12269 BroadcastMask[i + j] = j;
12270 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
12275 // Bail if the shuffle mask doesn't cross 128-bit lanes.
12276 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12279 // Bail if we already have a repeated lane shuffle mask.
12280 SmallVector<int, 8> RepeatedShuffleMask;
12281 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12284 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12285 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12286 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12287 int NumSubLanes = NumLanes * SubLaneScale;
12288 int NumSubLaneElts = NumLaneElts / SubLaneScale;
12290 // Check that all the sources are coming from the same lane and see if we can
12291 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12292 // determine the source sub-lane for each destination sub-lane.
12293 int TopSrcSubLane = -1;
12294 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12295 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12296 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12297 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12299 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12300 // Extract the sub-lane mask, check that it all comes from the same lane
12301 // and normalize the mask entries to come from the first lane.
12303 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12304 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12305 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12308 int Lane = (M % NumElts) / NumLaneElts;
12309 if ((0 <= SrcLane) && (SrcLane != Lane))
12312 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12313 SubLaneMask[Elt] = LocalM;
12316 // Whole sub-lane is UNDEF.
12320 // Attempt to match against the candidate repeated sub-lane masks.
12321 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12322 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12323 for (int i = 0; i != NumSubLaneElts; ++i) {
12324 if (M1[i] < 0 || M2[i] < 0)
12326 if (M1[i] != M2[i])
12332 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12333 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12336 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12337 for (int i = 0; i != NumSubLaneElts; ++i) {
12338 int M = SubLaneMask[i];
12341 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12342 "Unexpected mask element");
12343 RepeatedSubLaneMask[i] = M;
12346 // Track the topmost source sub-lane - by setting the remaining to UNDEF
12347 // we can greatly simplify shuffle matching.
12348 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12349 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12350 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12354 // Bail if we failed to find a matching repeated sub-lane mask.
12355 if (Dst2SrcSubLanes[DstSubLane] < 0)
12358 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12359 "Unexpected source lane");
12361 // Create a repeating shuffle mask for the entire vector.
12362 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12363 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12364 int Lane = SubLane / SubLaneScale;
12365 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12366 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12367 int M = RepeatedSubLaneMask[Elt];
12370 int Idx = (SubLane * NumSubLaneElts) + Elt;
12371 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12374 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12376 // Shuffle each source sub-lane to its destination.
12377 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12378 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12379 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12380 if (SrcSubLane < 0)
12382 for (int j = 0; j != NumSubLaneElts; ++j)
12383 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12386 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12390 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12391 unsigned &ShuffleImm,
12392 ArrayRef<int> Mask) {
12393 int NumElts = VT.getVectorNumElements();
12394 assert(VT.getScalarSizeInBits() == 64 &&
12395 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12396 "Unexpected data type for VSHUFPD");
12398 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
12399 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7.
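// For example (an illustrative case): the v4f64 mask {0, 5, 2, 7} selects
// V1[0], V2[1], V1[2], V2[3]; bit i of the immediate is Mask[i] % 2, giving
// ShuffleImm = 0b1010.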
12401 bool ShufpdMask = true;
12402 bool CommutableMask = true;
12403 for (int i = 0; i < NumElts; ++i) {
12404 if (Mask[i] == SM_SentinelUndef)
12408 int Val = (i & 6) + NumElts * (i & 1);
12409 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12410 if (Mask[i] < Val || Mask[i] > Val + 1)
12411 ShufpdMask = false;
12412 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12413 CommutableMask = false;
12414 ShuffleImm |= (Mask[i] % 2) << i;
12419 if (CommutableMask) {
12427 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12428 ArrayRef<int> Mask, SDValue V1,
12429 SDValue V2, SelectionDAG &DAG) {
12430 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
12431 "Unexpected data type for VSHUFPD");
12433 unsigned Immediate = 0;
12434 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12437 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12438 DAG.getConstant(Immediate, DL, MVT::i8));
12441 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12442 ArrayRef<int> Mask, SDValue V1,
12443 SDValue V2, SelectionDAG &DAG) {
12444 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12445 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12447 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12449 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12451 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12454 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12456 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12457 /// isn't available.
12458 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12459 const APInt &Zeroable,
12460 SDValue V1, SDValue V2,
12461 const X86Subtarget &Subtarget,
12462 SelectionDAG &DAG) {
12463 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12464 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12465 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12467 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12468 Zeroable, Subtarget, DAG))
12471 if (V2.isUndef()) {
12472 // Check for being able to broadcast a single element.
12473 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12474 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12477 // Use low duplicate instructions for masks that match their pattern.
12478 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12479 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12481 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12482 // Non-half-crossing single input shuffles can be lowered with an
12483 // interleaved permutation.
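// For example (an illustrative case): the mask {1, 0, 3, 2} swaps the elements
// within each 128-bit lane and encodes as the VPERMILPD immediate 0b0101.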
12484 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12485 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12486 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12487 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12490 // With AVX2 we have direct support for this permutation.
12491 if (Subtarget.hasAVX2())
12492 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12493 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12495 // Try to create an in-lane repeating shuffle mask and then shuffle the
12497 // results into the target lanes.
12497 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12498 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12501 // Otherwise, fall back.
12502 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12506 // Use dedicated unpack instructions for masks that match their pattern.
12508 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12511 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12512 Zeroable, Subtarget, DAG))
12515 // Check if the blend happens to exactly fit that of SHUFPD.
12517 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12520 // Try to create an in-lane repeating shuffle mask and then shuffle the
12521 // results into the target lanes.
12522 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12523 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12526 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12527 // shuffle. However, if we have AVX2 and either input is already in place,
12528 // we will be able to shuffle the other input even across lanes in a single
12529 // instruction, so skip this pattern.
12530 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12531 isShuffleMaskInputInPlace(1, Mask))))
12532 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12533 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12535 // If we have VLX support, we can use VEXPAND.
12536 if (Subtarget.hasVLX())
12537 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12538 V1, V2, DAG, Subtarget))
12541 // If we have AVX2 then we always want to lower with a blend because at v4 we
12542 // can fully permute the elements.
12543 if (Subtarget.hasAVX2())
12544 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12547 // Otherwise fall back on generic lowering.
12548 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12551 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12553 /// This routine is only called when we have AVX2 and thus a reasonable
12554 /// instruction set for v4i64 shuffling.
12555 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12556 const APInt &Zeroable,
12557 SDValue V1, SDValue V2,
12558 const X86Subtarget &Subtarget,
12559 SelectionDAG &DAG) {
12560 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12561 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12562 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12563 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12565 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12566 Zeroable, Subtarget, DAG))
12569 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12570 Zeroable, Subtarget, DAG))
12573 // Check for being able to broadcast a single element.
12574 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12575 Mask, Subtarget, DAG))
12578 if (V2.isUndef()) {
12579 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12580 // can use lower latency instructions that will operate on both lanes.
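// For example (an illustrative case): the v4i64 mask {1, 0, 3, 2} repeats as
// {1, 0} in each 128-bit lane and scales to the v8i32 PSHUFD mask {2, 3, 0, 1}.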
12581 SmallVector<int, 2> RepeatedMask;
12582 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12583 SmallVector<int, 4> PSHUFDMask;
12584 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
12585 return DAG.getBitcast(
12587 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12588 DAG.getBitcast(MVT::v8i32, V1),
12589 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12592 // AVX2 provides a direct instruction for permuting a single input across
12594 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12595 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12598 // Try to use shift instructions.
12599 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12600 Zeroable, Subtarget, DAG))
12603 // If we have VLX support, we can use VALIGN or VEXPAND.
12604 if (Subtarget.hasVLX()) {
12605 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12606 Mask, Subtarget, DAG))
12609 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12610 V1, V2, DAG, Subtarget))
12614 // Try to use PALIGNR.
12615 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12616 Mask, Subtarget, DAG))
12619 // Use dedicated unpack instructions for masks that match their pattern.
12621 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12624 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12625 // shuffle. However, if we have AVX2 and either input is already in place,
12626 // we will be able to shuffle the other input even across lanes in a single
12627 // instruction, so skip this pattern.
12628 if (!isShuffleMaskInputInPlace(0, Mask) &&
12629 !isShuffleMaskInputInPlace(1, Mask))
12630 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12631 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12634 // Otherwise fall back on generic blend lowering.
12635 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12639 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12641 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12642 /// isn't available.
12643 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12644 const APInt &Zeroable,
12645 SDValue V1, SDValue V2,
12646 const X86Subtarget &Subtarget,
12647 SelectionDAG &DAG) {
12648 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12649 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12650 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12652 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12653 Zeroable, Subtarget, DAG))
12656 // Check for being able to broadcast a single element.
12657 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12658 Mask, Subtarget, DAG))
12661 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12662 // options to efficiently lower the shuffle.
12663 SmallVector<int, 4> RepeatedMask;
12664 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12665 assert(RepeatedMask.size() == 4 &&
12666 "Repeated masks must be half the mask width!");
12668 // Use even/odd duplicate instructions for masks that match their pattern.
12669 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12670 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12671 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12672 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12675 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12676 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12678 // Use dedicated unpack instructions for masks that match their pattern.
12680 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12683 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12684 // have already handled any direct blends.
12685 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12688 // Try to create an in-lane repeating shuffle mask and then shuffle the
12689 // results into the target lanes.
12690 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12691 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12694 // If we have a single input shuffle with different shuffle patterns in the
12695 // two 128-bit lanes, use the variable mask form of VPERMILPS.
12696 if (V2.isUndef()) {
12697 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12698 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12699 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12701 if (Subtarget.hasAVX2())
12702 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12704 // Otherwise, fall back.
12705 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12709 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12711 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12712 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12714 // If we have VLX support, we can use VEXPAND.
12715 if (Subtarget.hasVLX())
12716 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12717 V1, V2, DAG, Subtarget))
12720 // For non-AVX512, if the mask is of 16-bit elements in lane then try to split,
12721 // since after the split we get more efficient code using vpunpcklwd and
12722 // vpunpckhwd instrs than with vblend.
12723 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12724 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
12728 // If we have AVX2 then we always want to lower with a blend because at v8 we
12729 // can fully permute the elements.
12730 if (Subtarget.hasAVX2())
12731 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12734 // Otherwise fall back on generic lowering.
12735 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12738 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12740 /// This routine is only called when we have AVX2 and thus a reasonable
12741 /// instruction set for v8i32 shuffling.
12742 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12743 const APInt &Zeroable,
12744 SDValue V1, SDValue V2,
12745 const X86Subtarget &Subtarget,
12746 SelectionDAG &DAG) {
12747 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12748 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12749 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12750 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12752 // Whenever we can lower this as a zext, that instruction is strictly faster
12753 // than any alternative. It also allows us to fold memory operands into the
12754 // shuffle in many cases.
12755 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12756 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12759 // For non-AVX512, if the mask is of 16-bit elements in lane then try to split,
12760 // since after the split we get more efficient code than vblend by using
12761 // vpunpcklwd and vpunpckhwd instrs.
12762 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12763 !Subtarget.hasAVX512())
12765 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
12768 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12769 Zeroable, Subtarget, DAG))
12772 // Check for being able to broadcast a single element.
12773 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12774 Mask, Subtarget, DAG))
12777 // If the shuffle mask is repeated in each 128-bit lane we can use more
12778 // efficient instructions that mirror the shuffles across the two 128-bit
12780 SmallVector<int, 4> RepeatedMask;
12781 bool Is128BitLaneRepeatedShuffle =
12782 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12783 if (Is128BitLaneRepeatedShuffle) {
12784 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12786 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12787 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12789 // Use dedicated unpack instructions for masks that match their pattern.
12791 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
12795 // Try to use shift instructions.
12796 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
12797 Zeroable, Subtarget, DAG))
12800 // If we have VLX support, we can use VALIGN or EXPAND.
12801 if (Subtarget.hasVLX()) {
12802 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
12803 Mask, Subtarget, DAG))
12806 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
12807 V1, V2, DAG, Subtarget))
12811 // Try to use byte rotation instructions.
12812 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12813 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12816 // Try to create an in-lane repeating shuffle mask and then shuffle the
12817 // results into the target lanes.
12818 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12819 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12822 // If the shuffle patterns aren't repeated but it is a single input, directly
12823 // generate a cross-lane VPERMD instruction.
12824 if (V2.isUndef()) {
12825 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12826 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
12829 // Assume that a single SHUFPS is faster than an alternative sequence of
12830 // multiple instructions (even if the CPU has a domain penalty).
12831 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12832 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
12833 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
12834 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
12835 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
12836 CastV1, CastV2, DAG);
12837 return DAG.getBitcast(MVT::v8i32, ShufPS);
12840 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12842 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12843 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12846 // Otherwise fall back on generic blend lowering.
12847 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
12851 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
12853 /// This routine is only called when we have AVX2 and thus a reasonable
12854 /// instruction set for v16i16 shuffling.
12855 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12856 const APInt &Zeroable,
12857 SDValue V1, SDValue V2,
12858 const X86Subtarget &Subtarget,
12859 SelectionDAG &DAG) {
12860 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12861 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12862 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12863 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
12865 // Whenever we can lower this as a zext, that instruction is strictly faster
12866 // than any alternative. It also allows us to fold memory operands into the
12867 // shuffle in many cases.
12868 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12869 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12872 // Check for being able to broadcast a single element.
12873 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
12874 Mask, Subtarget, DAG))
12877 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
12878 Zeroable, Subtarget, DAG))
12881 // Use dedicated unpack instructions for masks that match their pattern.
12883 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
12886 // Try to use shift instructions.
12887 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
12888 Zeroable, Subtarget, DAG))
12891 // Try to use byte rotation instructions.
12892 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12893 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12896 // Try to create an in-lane repeating shuffle mask and then shuffle the
12897 // results into the target lanes.
12898 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12899 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12902 if (V2.isUndef()) {
12903 // There are no generalized cross-lane shuffle operations available on i16
12905 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
12906 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
12909 SmallVector<int, 8> RepeatedMask;
12910 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12911 // As this is a single-input shuffle, the repeated mask should be
12912 // a strictly valid v8i16 mask that we can pass through to the v8i16
12913 // lowering to handle even the v16 case.
12914 return lowerV8I16GeneralSingleInputVectorShuffle(
12915 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
12919 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12920 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
12923 // AVX512BWVL can lower to VPERMW.
12924 if (Subtarget.hasBWI() && Subtarget.hasVLX())
12925 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
12927 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12929 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12930 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12933 // Otherwise fall back on generic lowering.
12934 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
12937 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
12939 /// This routine is only called when we have AVX2 and thus a reasonable
12940 /// instruction set for v32i8 shuffling.
12941 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12942 const APInt &Zeroable,
12943 SDValue V1, SDValue V2,
12944 const X86Subtarget &Subtarget,
12945 SelectionDAG &DAG) {
12946 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12947 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12948 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
12949 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
12951 // Whenever we can lower this as a zext, that instruction is strictly faster
12952 // than any alternative. It also allows us to fold memory operands into the
12953 // shuffle in many cases.
12954 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12955 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12958 // Check for being able to broadcast a single element.
12959 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
12960 Mask, Subtarget, DAG))
12963 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
12964 Zeroable, Subtarget, DAG))
12967 // Use dedicated unpack instructions for masks that match their pattern.
12969 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
12972 // Try to use shift instructions.
12973 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
12974 Zeroable, Subtarget, DAG))
12977 // Try to use byte rotation instructions.
12978 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12979 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12982 // Try to create an in-lane repeating shuffle mask and then shuffle the
12983 // results into the target lanes.
12984 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12985 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12988 // There are no generalized cross-lane shuffle operations available on i8
12990 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
12991 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
12994 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12995 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
12998 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13000 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13001 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13004 // Otherwise fall back on generic lowering.
13005 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13008 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13010 /// This routine either breaks down the specific type of a 256-bit x86 vector
13011 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13012 /// together based on the available instructions.
13013 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13014 MVT VT, SDValue V1, SDValue V2,
13015 const APInt &Zeroable,
13016 const X86Subtarget &Subtarget,
13017 SelectionDAG &DAG) {
13018 // If we have a single input to the zero element, insert that into V1 if we
13019 // can do so cheaply.
13020 int NumElts = VT.getVectorNumElements();
13021 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13023 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13024 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13025 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13028 // Handle special cases where the lower or upper half is UNDEF.
13030 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13033 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13034 // can check for those subtargets here and avoid much of the subtarget
13035 // querying in the per-vector-type lowering routines. With AVX1 we have
13036 // essentially *zero* ability to manipulate a 256-bit vector with integer
13037 // types. Since we'll use floating point types there eventually, just
13038 // immediately cast everything to a float and operate entirely in that domain.
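// For example (an illustrative case): without AVX2, a v4i64 shuffle is bitcast
// to v4f64, shuffled in the floating point domain, and bitcast back to v4i64.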
13039 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13040 int ElementBits = VT.getScalarSizeInBits();
13041 if (ElementBits < 32) {
13042 // No floating point type available; if we can't use the bit operations
13043 // for masking/blending then decompose into 128-bit vectors.
13045 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13047 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13049 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13052 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13053 VT.getVectorNumElements());
13054 V1 = DAG.getBitcast(FpVT, V1);
13055 V2 = DAG.getBitcast(FpVT, V2);
13056 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13059 switch (VT.SimpleTy) {
13061 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13063 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13065 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13067 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13069 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13071 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13074 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13078 /// \brief Try to lower a vector shuffle as 128-bit shuffles.
13079 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13080 ArrayRef<int> Mask, SDValue V1,
13081 SDValue V2, SelectionDAG &DAG) {
13082 assert(VT.getScalarSizeInBits() == 64 &&
13083 "Unexpected element type size for 128bit shuffle.");
13085 // Handling a 256-bit vector requires VLX, and for that case the function
13086 // lowerV2X128VectorShuffle() is most probably a better solution.
13087 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13089 SmallVector<int, 4> WidenedMask;
13090 if (!canWidenShuffleElements(Mask, WidenedMask))
13093 // Check for patterns which can be matched with a single insert of a 256-bit
13095 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13096 {0, 1, 2, 3, 0, 1, 2, 3});
13097 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13098 {0, 1, 2, 3, 8, 9, 10, 11})) {
13099 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13100 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13101 DAG.getIntPtrConstant(0, DL));
13102 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13103 OnlyUsesV1 ? V1 : V2,
13104 DAG.getIntPtrConstant(0, DL));
13105 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13108 assert(WidenedMask.size() == 4);
13110 // See if this is an insertion of the lower 128 bits of V2 into V1.
13111 bool IsInsert = true;
13113 for (int i = 0; i < 4; ++i) {
13114 assert(WidenedMask[i] >= -1);
13115 if (WidenedMask[i] < 0)
13118 // Make sure all V1 subvectors are in place.
13119 if (WidenedMask[i] < 4) {
13120 if (WidenedMask[i] != i) {
13125 // Make sure we only have a single V2 index and it's the lowest 128 bits.
13126 if (V2Index >= 0 || WidenedMask[i] != 4) {
13133 if (IsInsert && V2Index >= 0) {
13134 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13135 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13136 DAG.getIntPtrConstant(0, DL));
13137 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13140 // Try to lower to vshuf64x2/vshuf32x4.
13141 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13142 unsigned PermMask = 0;
13143 // Ensure elements came from the same Op.
13144 for (int i = 0; i < 4; ++i) {
13145 assert(WidenedMask[i] >= -1);
13146 if (WidenedMask[i] < 0)
13149 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13150 unsigned OpIndex = i / 2;
13151 if (Ops[OpIndex].isUndef())
13153 else if (Ops[OpIndex] != Op)
13156 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13157 // bits defined by a vshuf64x2 instruction's immediate control byte.
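// For example (an illustrative case): a widened mask of {0, 2, 5, 7} takes
// 128-bit blocks 0 and 2 from V1 and blocks 1 and 3 from V2, giving
// Ops = {V1, V2} and PermMask = 0xD8.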
13158 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13161 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13162 DAG.getConstant(PermMask, DL, MVT::i8));
13165 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13166 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13167 const APInt &Zeroable,
13168 SDValue V1, SDValue V2,
13169 const X86Subtarget &Subtarget,
13170 SelectionDAG &DAG) {
13171 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13172 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13173 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13175 if (V2.isUndef()) {
13176 // Use low duplicate instructions for masks that match their pattern.
13177 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13178 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13180 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13181 // Non-half-crossing single input shuffles can be lowered with an
13182 // interleaved permutation.
13183 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13184 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13185 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13186 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13187 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13188 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13191 SmallVector<int, 4> RepeatedMask;
13192 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13193 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13194 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13197 if (SDValue Shuf128 =
13198 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13201 if (SDValue Unpck =
13202 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13205 // Check if the blend happens to exactly fit that of SHUFPD.
13207 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13210 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13211 V2, DAG, Subtarget))
13214 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13215 Zeroable, Subtarget, DAG))
13218 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13221 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13222 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13223 const APInt &Zeroable,
13224 SDValue V1, SDValue V2,
13225 const X86Subtarget &Subtarget,
13226 SelectionDAG &DAG) {
13227 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13228 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13229 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13231 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13232 // options to efficiently lower the shuffle.
13233 SmallVector<int, 4> RepeatedMask;
13234 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13235 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13237 // Use even/odd duplicate instructions for masks that match their pattern.
13238 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13239 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13240 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13241 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13244 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13245 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13247 // Use dedicated unpack instructions for masks that match their pattern.
13248 if (SDValue Unpck =
13249 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13252 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13253 Zeroable, Subtarget, DAG))
13256 // Otherwise, fall back to a SHUFPS sequence.
13257 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13259 // If we have AVX512F support, we can use VEXPAND.
13260 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13261 V1, V2, DAG, Subtarget))
13264 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13267 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13268 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13269 const APInt &Zeroable,
13270 SDValue V1, SDValue V2,
13271 const X86Subtarget &Subtarget,
13272 SelectionDAG &DAG) {
13273 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13274 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13275 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13277 if (SDValue Shuf128 =
13278 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13281 if (V2.isUndef()) {
13282 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13283 // can use lower latency instructions that will operate on all four
13285 SmallVector<int, 2> Repeated128Mask;
13286 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13287 SmallVector<int, 4> PSHUFDMask;
13288 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
13289 return DAG.getBitcast(
13291 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13292 DAG.getBitcast(MVT::v16i32, V1),
13293 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13296 SmallVector<int, 4> Repeated256Mask;
13297 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13298 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13299 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13302 // Try to use shift instructions.
13303 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13304 Zeroable, Subtarget, DAG))
13307 // Try to use VALIGN.
13308 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13309 Mask, Subtarget, DAG))
13312 // Try to use PALIGNR.
13313 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13314 Mask, Subtarget, DAG))
13317 if (SDValue Unpck =
13318 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13320 // If we have AVX512F support, we can use VEXPAND.
13321 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13322 V2, DAG, Subtarget))
13325 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13326 Zeroable, Subtarget, DAG))
13329 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13332 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13333 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13334 const APInt &Zeroable,
13335 SDValue V1, SDValue V2,
13336 const X86Subtarget &Subtarget,
13337 SelectionDAG &DAG) {
13338 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13339 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13340 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13342 // Whenever we can lower this as a zext, that instruction is strictly faster
13343 // than any alternative. It also allows us to fold memory operands into the
13344 // shuffle in many cases.
13345 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13346 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13349 // If the shuffle mask is repeated in each 128-bit lane we can use more
13350 // efficient instructions that mirror the shuffles across the four 128-bit
13352 SmallVector<int, 4> RepeatedMask;
13353 bool Is128BitLaneRepeatedShuffle =
13354 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13355 if (Is128BitLaneRepeatedShuffle) {
13356 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13358 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13359 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13361 // Use dedicated unpack instructions for masks that match their pattern.
13363 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13367 // Try to use shift instructions.
13368 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13369 Zeroable, Subtarget, DAG))
13372 // Try to use VALIGN.
13373 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13374 Mask, Subtarget, DAG))
13377 // Try to use byte rotation instructions.
13378 if (Subtarget.hasBWI())
13379 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13380 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13383 // Assume that a single SHUFPS is faster than using a permv shuffle.
13384 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13385 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13386 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13387 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13388 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13389 CastV1, CastV2, DAG);
13390 return DAG.getBitcast(MVT::v16i32, ShufPS);
13392 // If we have AVX512F support, we can use VEXPAND.
13393 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13394 V1, V2, DAG, Subtarget))
13397 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13398 Zeroable, Subtarget, DAG))
13400 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13403 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13404 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13405 const APInt &Zeroable,
13406 SDValue V1, SDValue V2,
13407 const X86Subtarget &Subtarget,
13408 SelectionDAG &DAG) {
13409 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13410 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13411 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13412 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13414 // Whenever we can lower this as a zext, that instruction is strictly faster
13415 // than any alternative. It also allows us to fold memory operands into the
13416 // shuffle in many cases.
13417 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13418 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13421 // Use dedicated unpack instructions for masks that match their pattern.
13423 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13426 // Try to use shift instructions.
13427 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13428 Zeroable, Subtarget, DAG))
13431 // Try to use byte rotation instructions.
13432 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13433 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13436 if (V2.isUndef()) {
13437 SmallVector<int, 8> RepeatedMask;
13438 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13439 // As this is a single-input shuffle, the repeated mask should be
13440 // a strictly valid v8i16 mask that we can pass through to the v8i16
13441 // lowering to handle even the v32 case.
13442 return lowerV8I16GeneralSingleInputVectorShuffle(
13443 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13447 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13448 Zeroable, Subtarget, DAG))
13451 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13454 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13455 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13456 const APInt &Zeroable,
13457 SDValue V1, SDValue V2,
13458 const X86Subtarget &Subtarget,
13459 SelectionDAG &DAG) {
13460 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13461 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13462 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13463 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13465 // Whenever we can lower this as a zext, that instruction is strictly faster
13466 // than any alternative. It also allows us to fold memory operands into the
13467 // shuffle in many cases.
13468 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13469 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13472 // Use dedicated unpack instructions for masks that match their pattern.
13474 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13477 // Try to use shift instructions.
13478 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13479 Zeroable, Subtarget, DAG))
13482 // Try to use byte rotation instructions.
13483 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13484 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13487 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13488 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13491 // VBMI can use VPERMV/VPERMV3 byte shuffles.
13492 if (Subtarget.hasVBMI())
13493 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13495 // Try to create an in-lane repeating shuffle mask and then shuffle the
13496 // results into the target lanes.
13497 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13498 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13501 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13502 Zeroable, Subtarget, DAG))
13505 // FIXME: Implement direct support for this type!
13506 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13509 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13511 /// This routine either breaks down the specific type of a 512-bit x86 vector
13512 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13513 /// together based on the available instructions.
13514 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13515 MVT VT, SDValue V1, SDValue V2,
13516 const APInt &Zeroable,
13517 const X86Subtarget &Subtarget,
13518 SelectionDAG &DAG) {
13519 assert(Subtarget.hasAVX512() &&
13520 "Cannot lower 512-bit vectors w/ basic ISA!");
13522 // If we have a single input to the zero element, insert that into V1 if we
13523 // can do so cheaply.
13524 int NumElts = Mask.size();
13525 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13527 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13528 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13529 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13532 // Check for being able to broadcast a single element.
13533 if (SDValue Broadcast =
13534 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13537 // Dispatch to each element type for lowering. If we don't have support for
13538 // specific element type shuffles at 512 bits, immediately split them and
13539 // lower them. Each lowering routine of a given type is allowed to assume that
13540 // the requisite ISA extensions for that element type are available.
13541 switch (VT.SimpleTy) {
13543 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13545 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13547 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13549 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13551 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13553 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13556 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13560 // Lower vXi1 vector shuffles.
13561 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13562 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13563 // vector, shuffle it, and then truncate it back.
13564 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13565 MVT VT, SDValue V1, SDValue V2,
13566 const X86Subtarget &Subtarget,
13567 SelectionDAG &DAG) {
13568 assert(Subtarget.hasAVX512() &&
13569 "Cannot lower 512-bit vectors w/o basic ISA!");
13571 switch (VT.SimpleTy) {
13573 llvm_unreachable("Expected a vector of i1 elements");
13575 ExtVT = MVT::v2i64;
13578 ExtVT = MVT::v4i32;
13581 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13584 ExtVT = MVT::v16i32;
13587 ExtVT = MVT::v32i16;
13590 ExtVT = MVT::v64i8;
13594 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13595 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13596 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13597 V1 = getOnesVector(ExtVT, DAG, DL);
13599 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13602 V2 = DAG.getUNDEF(ExtVT);
13603 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13604 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13605 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13606 V2 = getOnesVector(ExtVT, DAG, DL);
13608 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13610 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13611 // As i1 was sign extended, we can use X86ISD::CVT2MASK.
13612 int NumElems = VT.getVectorNumElements();
13613 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13614 (Subtarget.hasDQI() && (NumElems < 32)))
13615 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13617 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13620 /// Helper function that returns true if the shuffle mask should be
13621 /// commuted to improve canonicalization.
13622 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13623 int NumElements = Mask.size();
13625 int NumV1Elements = 0, NumV2Elements = 0;
13629 else if (M < NumElements)
13634 // Commute the shuffle as needed such that more elements come from V1 than
13635 // V2. This allows us to match the shuffle pattern strictly on how many
13636 // elements come from V1 without handling the symmetric cases.
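// For example (an illustrative case): a v4 mask of {4, 5, 0, 6} has three V2
// elements and one V1 element, so it is commuted to {0, 1, 4, 2}.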
13637 if (NumV2Elements > NumV1Elements)
13640 assert(NumV1Elements > 0 && "No V1 indices");
13642 if (NumV2Elements == 0)
13645 // When the number of V1 and V2 elements is the same, try to minimize the
13646 // number of uses of V2 in the low half of the vector. When that is tied,
13647 // ensure that the sum of indices for V1 is equal to or lower than the sum
13648 // of indices for V2. When those are equal, try to ensure that the number of odd
13649 // indices for V1 is lower than the number of odd indices for V2.
13650 if (NumV1Elements == NumV2Elements) {
13651 int LowV1Elements = 0, LowV2Elements = 0;
13652 for (int M : Mask.slice(0, NumElements / 2))
13653 if (M >= NumElements)
13657 if (LowV2Elements > LowV1Elements)
13659 if (LowV2Elements == LowV1Elements) {
13660 int SumV1Indices = 0, SumV2Indices = 0;
13661 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13662 if (Mask[i] >= NumElements)
13664 else if (Mask[i] >= 0)
13666 if (SumV2Indices < SumV1Indices)
13668 if (SumV2Indices == SumV1Indices) {
13669 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13670 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13671 if (Mask[i] >= NumElements)
13672 NumV2OddIndices += i % 2;
13673 else if (Mask[i] >= 0)
13674 NumV1OddIndices += i % 2;
13675 if (NumV2OddIndices < NumV1OddIndices)
13684 /// \brief Top-level lowering for x86 vector shuffles.
13686 /// This handles decomposition, canonicalization, and lowering of all x86
13687 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13688 /// above in helper routines. The canonicalization attempts to widen shuffles
13689 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13690 /// s.t. only one of the two inputs needs to be tested, etc.
13691 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13692 SelectionDAG &DAG) {
13693 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13694 ArrayRef<int> Mask = SVOp->getMask();
13695 SDValue V1 = Op.getOperand(0);
13696 SDValue V2 = Op.getOperand(1);
13697 MVT VT = Op.getSimpleValueType();
13698 int NumElements = VT.getVectorNumElements();
13700 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13702 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13703 "Can't lower MMX shuffles");
13705 bool V1IsUndef = V1.isUndef();
13706 bool V2IsUndef = V2.isUndef();
13707 if (V1IsUndef && V2IsUndef)
13708 return DAG.getUNDEF(VT);
13710 // When we create a shuffle node we put the UNDEF node in the second operand,
13711 // but in some cases the first operand may be transformed to UNDEF.
13712 // In this case we should just commute the node.
13714 return DAG.getCommutedVectorShuffle(*SVOp);
13716 // Check for non-undef masks pointing at an undef vector and make the masks
13717 // undef as well. This makes it easier to match the shuffle based solely on
13721 if (M >= NumElements) {
13722 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13723 for (int &M : NewMask)
13724 if (M >= NumElements)
13726 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13729 // Check for illegal shuffle mask element index values.
13730 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13731 assert(llvm::all_of(Mask,
13732 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13733 "Out of bounds shuffle index");
13735 // We actually see shuffles that are entirely re-arrangements of a set of
13736 // zero inputs. This mostly happens while decomposing complex shuffles into
13737 // simple ones. Directly lower these as a buildvector of zeros.
13738 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13739 if (Zeroable.isAllOnesValue())
13740 return getZeroVector(VT, Subtarget, DAG, DL);
13742 // Try to collapse shuffles into using a vector type with fewer elements but
13743 // wider element types. We cap this to not form integers or floating point
13744 // elements wider than 64 bits, but it might be interesting to form i128
13745 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
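// For example (an illustrative case): a v4i32 shuffle with mask {0, 1, 6, 7}
// can be widened to a v2i64 shuffle with mask {0, 3}.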
13746 SmallVector<int, 16> WidenedMask;
13747 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13748 canWidenShuffleElements(Mask, WidenedMask)) {
13749 MVT NewEltVT = VT.isFloatingPoint()
13750 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13751 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13752 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13753 // Make sure that the new vector type is legal. For example, v2f64 isn't
13755 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13756 V1 = DAG.getBitcast(NewVT, V1);
13757 V2 = DAG.getBitcast(NewVT, V2);
13758 return DAG.getBitcast(
13759 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13763 // Commute the shuffle if it will improve canonicalization.
13764 if (canonicalizeShuffleMaskWithCommute(Mask))
13765 return DAG.getCommutedVectorShuffle(*SVOp);
13767 // For each vector width, delegate to a specialized lowering routine.
13768 if (VT.is128BitVector())
13769 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13772 if (VT.is256BitVector())
13773 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13776 if (VT.is512BitVector())
13777 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13781 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13783 llvm_unreachable("Unimplemented!");
13786 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
13787 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
13788 const X86Subtarget &Subtarget,
13789 SelectionDAG &DAG) {
13790 SDValue Cond = Op.getOperand(0);
13791 SDValue LHS = Op.getOperand(1);
13792 SDValue RHS = Op.getOperand(2);
13794 MVT VT = Op.getSimpleValueType();
13796 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13798 auto *CondBV = cast<BuildVectorSDNode>(Cond);
13800 // Only non-legal VSELECTs reach this lowering; convert those into generic
13801 // shuffles and re-use the shuffle lowering path for blends.
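// For example (an illustrative case): a v4i32 VSELECT with the constant
// condition <-1, 0, -1, 0> becomes the shuffle mask {0, 5, 2, 7}.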
13802 SmallVector<int, 32> Mask;
13803 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
13804 SDValue CondElt = CondBV->getOperand(i);
13806 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
13809 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
13812 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
13813 // A vselect where all conditions and data are constants can be optimized into
13814 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
13815 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
13816 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
13817 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
13820 // Try to lower this to a blend-style vector shuffle. This can handle all
13821 // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;
13825 // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();
  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op; if we need to expand, return a null
  // SDValue.
13832 switch (Op.getSimpleValueType().SimpleTy) {
13834 // Most of the vector types have blends past SSE4.1.
13838 // The byte blends for AVX vectors were introduced only in AVX2.
13839 if (Subtarget.hasAVX2())
13846 // AVX-512 BWI and VLX features support VSELECT with i16 elements.
13847 if (Subtarget.hasBWI() && Subtarget.hasVLX())
  // FIXME: We should custom lower this by fixing the condition and using i8
  // blends.
13856 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
13857 MVT VT = Op.getSimpleValueType();
  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
    return SDValue();
13863 if (VT.getSizeInBits() == 8) {
13864 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
13865 Op.getOperand(0), Op.getOperand(1));
13866 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13867 DAG.getValueType(VT));
13868 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13871 if (VT == MVT::f32) {
13872 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to an FR32 register. It's only worth matching if the
13874 // result has a single use which is a store or a bitcast to i32. And in
13875 // the case of a store, it's not worth it if the index is a constant 0,
13876 // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
13879 SDNode *User = *Op.getNode()->use_begin();
13880 if ((User->getOpcode() != ISD::STORE ||
13881 isNullConstant(Op.getOperand(1))) &&
13882 (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
13885 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
                                  Op.getOperand(1));
13888 return DAG.getBitcast(MVT::f32, Extract);
13891 if (VT == MVT::i32 || VT == MVT::i64) {
13892 // ExtractPS/pextrq works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
13900 /// Extract one bit from mask vector, like v16i1 or v8i1.
13901 /// AVX-512 feature.
13903 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13904 SDValue Vec = Op.getOperand(0);
13906 MVT VecVT = Vec.getSimpleValueType();
13907 SDValue Idx = Op.getOperand(1);
13908 MVT EltVT = Op.getSimpleValueType();
13910 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13911 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
13912 "Unexpected vector type in ExtractBitFromMaskVector");
  // A variable index can't be handled in mask registers,
  // so extend the vector to VR512/VR128 first.
13916 if (!isa<ConstantSDNode>(Idx)) {
13917 unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
    // than extending to 128/256 bits.
13920 unsigned VecSize = (NumElts <= 4 ? 128 : 512);
13921 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
13922 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
13923 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13924 ExtVT.getVectorElementType(), Ext, Idx);
13925 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13928 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13929 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
13930 (VecVT.getVectorNumElements() < 8)) {
13931 // Use kshiftlw/rw instruction.
13932 VecVT = MVT::v16i1;
13933 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
13934 DAG.getUNDEF(VecVT),
13936 DAG.getIntPtrConstant(0, dl));
13938 unsigned MaxSift = VecVT.getVectorNumElements() - 1;
13939 if (MaxSift - IdxVal)
13940 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
13941 DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
13942 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
13943 DAG.getConstant(MaxSift, dl, MVT::i8));
13944 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13945 DAG.getIntPtrConstant(0, dl));
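  // A rough sketch of the constant-index sequence above for a v16i1 source and
  // IdxVal == 5: KSHIFTL by 10 moves bit 5 into bit 15, KSHIFTR by 15 then
  // moves it down to bit 0, so extracting element 0 yields the wanted bit.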
13949 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13950 SelectionDAG &DAG) const {
13952 SDValue Vec = Op.getOperand(0);
13953 MVT VecVT = Vec.getSimpleValueType();
13954 SDValue Idx = Op.getOperand(1);
13956 if (Op.getSimpleValueType() == MVT::i1)
13957 return ExtractBitFromMaskVector(Op, DAG);
13959 if (!isa<ConstantSDNode>(Idx)) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    // The IACA tool was used to get the performance estimates
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
13965 // example : extractelement <16 x i8> %a, i32 %i
13967 // Block Throughput: 3.00 Cycles
13968 // Throughput Bottleneck: Port5
13970 // | Num Of | Ports pressure in cycles | |
13971 // | Uops | 0 - DV | 5 | 6 | 7 | |
13972 // ---------------------------------------------
13973 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
13974 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
13975 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
13976 // Total Num Of Uops: 4
13979 // Block Throughput: 1.00 Cycles
13980 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
13982 // | | Ports pressure in cycles | |
13983 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
13984 // ---------------------------------------------------------
13985 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
13986 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
13987 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
13988 // Total Num Of Uops: 4
13993 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13995 // If this is a 256-bit vector result, first extract the 128-bit vector and
13996 // then extract the element from the 128-bit vector.
13997 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13998 // Get the 128-bit vector.
13999 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14000 MVT EltVT = VecVT.getVectorElementType();
14002 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14003 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14005 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14006 // this can be done with a mask.
14007 IdxVal &= ElemsPerChunk - 1;
14008 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
14009 DAG.getConstant(IdxVal, dl, MVT::i32));
14012 assert(VecVT.is128BitVector() && "Unexpected vector length");
14014 MVT VT = Op.getSimpleValueType();
14016 if (VT.getSizeInBits() == 16) {
14017 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14018 // we're going to zero extend the register or fold the store (SSE41 only).
14019 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14020 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14021 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14022 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14023 DAG.getBitcast(MVT::v4i32, Vec), Idx));
    // Transform it so it matches pextrw, which produces a 32-bit result.
14026 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14027 Op.getOperand(0), Op.getOperand(1));
14028 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14029 DAG.getValueType(VT));
14030 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
14033 if (Subtarget.hasSSE41())
14034 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
  // TODO: We only extract a single element from v16i8; we can probably afford
  // to be more aggressive here before using the default approach of spilling
  // to the stack.
14040 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14041 // Extract either the lowest i32 or any i16, and extract the sub-byte.
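    // For example, extracting byte 5 from a v16i8 takes the word path below:
    // extract i16 element 2 (bytes 4..5), shift right by 8, then truncate to
    // i8.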
14042 int DWordIdx = IdxVal / 4;
14043 if (DWordIdx == 0) {
14044 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14045 DAG.getBitcast(MVT::v4i32, Vec),
14046 DAG.getIntPtrConstant(DWordIdx, dl));
14047 int ShiftVal = (IdxVal % 4) * 8;
14049 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
14050 DAG.getConstant(ShiftVal, dl, MVT::i32));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    }
14054 int WordIdx = IdxVal / 2;
14055 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14056 DAG.getBitcast(MVT::v8i16, Vec),
14057 DAG.getIntPtrConstant(WordIdx, dl));
14058 int ShiftVal = (IdxVal % 2) * 8;
14060 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
14061 DAG.getConstant(ShiftVal, dl, MVT::i16));
14062 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14065 if (VT.getSizeInBits() == 32) {
14069 // SHUFPS the element to the lowest double word, then movss.
14070 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14071 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14072 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14073 DAG.getIntPtrConstant(0, dl));
14076 if (VT.getSizeInBits() == 64) {
14077 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14078 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
14079 // to match extract_elt for f64.
14083 // UNPCKHPD the element to the lowest double word, then movsd.
14084 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
14085 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
14086 int Mask[2] = { 1, -1 };
14087 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14088 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14089 DAG.getIntPtrConstant(0, dl));
14095 /// Insert one bit to mask vector, like v16i1 or v8i1.
14096 /// AVX-512 feature.
14098 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
14100 SDValue Vec = Op.getOperand(0);
14101 SDValue Elt = Op.getOperand(1);
14102 SDValue Idx = Op.getOperand(2);
14103 MVT VecVT = Vec.getSimpleValueType();
14105 if (!isa<ConstantSDNode>(Idx)) {
    // Non-constant index: extend the source and destination,
    // insert the element, and then truncate the result.
14108 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
14109 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
14110 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14111 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
14112 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
14113 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
14116 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14117 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14118 unsigned NumElems = VecVT.getVectorNumElements();
  if (Vec.isUndef()) {
    if (IdxVal)
      EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    return EltInVec;
  }
  // Insertion of one bit into the first or last position
  // can be done with two SHIFTs + OR.
  if (IdxVal == 0) {
14130 // EltInVec already at correct index and other bits are 0.
14131 // Clean the first bit in source vector.
14132 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
14134 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14135 DAG.getConstant(1, dl, MVT::i8));
14137 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  if (IdxVal == NumElems - 1) {
14140 // Move the bit to the last position inside the vector.
14141 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14142 DAG.getConstant(IdxVal, dl, MVT::i8));
14143 // Clean the last bit in the source vector.
14144 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14145 DAG.getConstant(1, dl, MVT::i8));
14146 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
14149 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
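  // Sketch of the last-position case for v8i1 and IdxVal == 7: the new bit is
  // shifted left into bit 7, the old bit 7 of Vec is cleared by shifting Vec
  // left and then right by one, and the OR merges the two masks.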
14152 // Use shuffle to insert element.
14153 SmallVector<int, 64> MaskVec(NumElems);
14154 for (unsigned i = 0; i != NumElems; ++i)
14155 MaskVec[i] = (i == IdxVal) ? NumElems : i;
14157 return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
14160 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14161 SelectionDAG &DAG) const {
14162 MVT VT = Op.getSimpleValueType();
14163 MVT EltVT = VT.getVectorElementType();
14164 unsigned NumElts = VT.getVectorNumElements();
14166 if (EltVT == MVT::i1)
14167 return InsertBitToMaskVector(Op, DAG);
14170 SDValue N0 = Op.getOperand(0);
14171 SDValue N1 = Op.getOperand(1);
14172 SDValue N2 = Op.getOperand(2);
14173 if (!isa<ConstantSDNode>(N2))
14175 auto *N2C = cast<ConstantSDNode>(N2);
14176 unsigned IdxVal = N2C->getZExtValue();
14178 bool IsZeroElt = X86::isZeroNode(N1);
14179 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
  // If we are inserting an element, see if we can do this more efficiently
  // with a blend shuffle against a rematerializable vector than with a costly
  // integer insertion.
14184 // TODO: pre-SSE41 targets will tend to use bit masking - this could still
14185 // be beneficial if we are inserting several zeros and can combine the masks.
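  // For example (a sketch of the idea): inserting zero into element 2 of a
  // v4i32 becomes a shuffle of N0 with a zero vector using the mask
  // <0, 1, 6, 3>, where indices >= 4 select from the rematerializable vector.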
14186 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
14187 SmallVector<int, 8> BlendMask;
14188 for (unsigned i = 0; i != NumElts; ++i)
14189 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
14190 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
14191 : DAG.getConstant(-1, dl, VT);
14192 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
14195 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
14196 // into that, and then insert the subvector back into the result.
14197 if (VT.is256BitVector() || VT.is512BitVector()) {
14198 // With a 256-bit vector, we can insert into the zero element efficiently
14199 // using a blend if we have AVX or AVX2 and the right data type.
14200 if (VT.is256BitVector() && IdxVal == 0) {
14201 // TODO: It is worthwhile to cast integer to floating point and back
14202 // and incur a domain crossing penalty if that's what we'll end up
14203 // doing anyway after extracting to a 128-bit vector.
14204 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
14205 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
14206 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
14207 N2 = DAG.getIntPtrConstant(1, dl);
14208 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
14212 // Get the desired 128-bit vector chunk.
14213 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
14215 // Insert the element into the desired chunk.
14216 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
14217 assert(isPowerOf2_32(NumEltsIn128));
14218 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
14219 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
14221 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
14222 DAG.getConstant(IdxIn128, dl, MVT::i32));
14224 // Insert the changed part back into the bigger vector
14225 return insert128BitVector(N0, V, IdxVal, DAG, dl);
14227 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
  // argument. SSE41 is required for pinsrb.
14231 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
14233 if (VT == MVT::v8i16) {
14234 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
14235 Opc = X86ISD::PINSRW;
14237 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
14238 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
14239 Opc = X86ISD::PINSRB;
14242 if (N1.getValueType() != MVT::i32)
14243 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
14244 if (N2.getValueType() != MVT::i32)
14245 N2 = DAG.getIntPtrConstant(IdxVal, dl);
14246 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
14249 if (Subtarget.hasSSE41()) {
14250 if (EltVT == MVT::f32) {
14251 // Bits [7:6] of the constant are the source select. This will always be
14252 // zero here. The DAG Combiner may combine an extract_elt index into
14253 // these bits. For example (insert (extract, 3), 2) could be matched by
14254 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
14255 // Bits [5:4] of the constant are the destination select. This is the
14256 // value of the incoming immediate.
14257 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
14258 // combine either bitwise AND or insert of float 0.0 to set these bits.
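      // As a concrete illustration of the encoding: inserting the scalar into
      // destination element 2 with no zeroing uses an immediate of
      // (0 << 6) | (2 << 4) | 0 = 0x20, which is what "IdxVal << 4" below
      // produces for IdxVal == 2.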
14260 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
14261 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
14262 // If this is an insertion of 32-bits into the low 32-bits of
14263 // a vector, we prefer to generate a blend with immediate rather
14264 // than an insertps. Blends are simpler operations in hardware and so
14265 // will always have equal or better performance than insertps.
      // But if optimizing for size and there's a load folding opportunity,
      // generate insertps because blendps does not have a 32-bit memory
      // operand form.
14269 N2 = DAG.getIntPtrConstant(1, dl);
14270 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14271 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
14273 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
      // Create this as a scalar-to-vector node.
14275 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14276 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
14279 // PINSR* works with constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
14287 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
14288 SelectionDAG &DAG) {
14290 MVT OpVT = Op.getSimpleValueType();
  // It's always cheaper to replace a xor+movd with xorps, and it simplifies
  // further folding.
14294 if (X86::isZeroNode(Op.getOperand(0)))
14295 return getZeroVector(OpVT, Subtarget, DAG, dl);
14297 // If this is a 256-bit vector result, first insert into a 128-bit
14298 // vector and then insert into the 256-bit vector.
14299 if (!OpVT.is128BitVector()) {
14300 // Insert into a 128-bit vector.
14301 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
14302 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
14303 OpVT.getVectorNumElements() / SizeFactor);
14305 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
14307 // Insert the 128-bit vector.
14308 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
14310 assert(OpVT.is128BitVector() && "Expected an SSE type!");
14312 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
14313 if (OpVT == MVT::v4i32)
14316 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
14317 return DAG.getBitcast(
14318 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
14321 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
14322 // a simple subregister reference or explicit instructions to grab
14323 // upper bits of a vector.
14324 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14325 SelectionDAG &DAG) {
14326 assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
14329 SDValue In = Op.getOperand(0);
14330 SDValue Idx = Op.getOperand(1);
14331 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14332 MVT ResVT = Op.getSimpleValueType();
14334 assert((In.getSimpleValueType().is256BitVector() ||
14335 In.getSimpleValueType().is512BitVector()) &&
14336 "Can only extract from 256-bit or 512-bit vectors");
14338 // If the input is a buildvector just emit a smaller one.
14339 unsigned ElemsPerChunk = ResVT.getVectorNumElements();
14340 if (In.getOpcode() == ISD::BUILD_VECTOR)
14341 return DAG.getBuildVector(
14342 ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
  // Everything else is legal.
  return Op;
14348 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
14349 // simple superregister reference or explicit instructions to insert
14350 // the upper bits of a vector.
14351 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14352 SelectionDAG &DAG) {
14353 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
14355 return insert1BitVector(Op, DAG, Subtarget);
14358 // Returns the appropriate wrapper opcode for a global reference.
14359 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
14360 // References to absolute symbols are never PC-relative.
14361 if (GV && GV->isAbsoluteSymbolRef())
14362 return X86ISD::Wrapper;
14364 CodeModel::Model M = getTargetMachine().getCodeModel();
14365 if (Subtarget.isPICStyleRIPRel() &&
14366 (M == CodeModel::Small || M == CodeModel::Kernel))
14367 return X86ISD::WrapperRIP;
14369 return X86ISD::Wrapper;
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
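// For example, in non-PIC 32-bit code a reference to a global "g" becomes
// roughly Wrapper(TargetGlobalAddress g) and is selected as "movl $g, %reg"
// (or folded into an addressing mode), while small-code-model 64-bit code uses
// WrapperRIP(TargetGlobalAddress g), i.e. "leaq g(%rip), %reg". This is only a
// sketch of the pattern, not a literal dump of the selected machine code.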
14379 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
14380 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
14382 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14383 // global base reg.
14384 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14386 auto PtrVT = getPointerTy(DAG.getDataLayout());
14387 SDValue Result = DAG.getTargetConstantPool(
14388 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
14390 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14391 // With PIC, the address is actually $g + Offset.
14394 DAG.getNode(ISD::ADD, DL, PtrVT,
14395 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14401 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
14402 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
14404 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14405 // global base reg.
14406 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14408 auto PtrVT = getPointerTy(DAG.getDataLayout());
14409 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
14411 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14413 // With PIC, the address is actually $g + Offset.
14416 DAG.getNode(ISD::ADD, DL, PtrVT,
14417 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14423 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
14424 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
14426 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14427 // global base reg.
14428 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
14429 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
14431 auto PtrVT = getPointerTy(DAG.getDataLayout());
14432 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
14435 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14437 // With PIC, the address is actually $g + Offset.
14438 if (isPositionIndependent() && !Subtarget.is64Bit()) {
14440 DAG.getNode(ISD::ADD, DL, PtrVT,
14441 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  // For symbols that require a load from a stub to get the address, emit the
  // load.
14446 if (isGlobalStubReference(OpFlag))
14447 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
14448 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14454 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
14455 // Create the TargetBlockAddressAddress node.
14456 unsigned char OpFlags =
14457 Subtarget.classifyBlockAddressReference();
14458 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
14459 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
14461 auto PtrVT = getPointerTy(DAG.getDataLayout());
14462 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
14463 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
14465 // With PIC, the address is actually $g + Offset.
14466 if (isGlobalRelativeToPICBase(OpFlags)) {
14467 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14468 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14474 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
14475 const SDLoc &dl, int64_t Offset,
14476 SelectionDAG &DAG) const {
14477 // Create the TargetGlobalAddress node, folding in the constant
14478 // offset if it is legal.
14479 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
14480 CodeModel::Model M = DAG.getTarget().getCodeModel();
14481 auto PtrVT = getPointerTy(DAG.getDataLayout());
14483 if (OpFlags == X86II::MO_NO_FLAG &&
14484 X86::isOffsetSuitableForCodeModel(Offset, M)) {
14485 // A direct static reference to a global.
14486 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
14489 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
14492 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
14494 // With PIC, the address is actually $g + Offset.
14495 if (isGlobalRelativeToPICBase(OpFlags)) {
14496 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14497 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  // For globals that require a load from a stub to get the address, emit the
  // load.
14502 if (isGlobalStubReference(OpFlags))
14503 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
14504 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14506 // If there was a non-zero offset that we didn't fold, create an explicit
14507 // addition for it.
14509 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
14510 DAG.getConstant(Offset, dl, PtrVT));
14516 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
14517 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
14518 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
14519 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
14524 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
14525 unsigned char OperandFlags, bool LocalDynamic = false) {
14526 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14527 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14529 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;
  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  }
14545 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
14546 MFI.setAdjustsStack(true);
14547 MFI.setHasCalls(true);
14549 SDValue Flag = Chain.getValue(1);
14550 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
14553 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
14555 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14558 SDLoc dl(GA); // ? function entry point might be better
14559 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14560 DAG.getNode(X86ISD::GlobalBaseReg,
14561 SDLoc(), PtrVT), InFlag);
14562 InFlag = Chain.getValue(1);
14564 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
14567 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
14569 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14571 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
14572 X86::RAX, X86II::MO_TLSGD);
14575 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
14581 // Get the start address of the TLS block for this module.
14582 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14583 .getInfo<X86MachineFunctionInfo>();
14584 MFI->incNumLocalDynamicTLSAccesses();
14588 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
14589 X86II::MO_TLSLD, /*LocalDynamic=*/true);
14592 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14593 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
14594 InFlag = Chain.getValue(1);
14595 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
14596 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of the TLS base address.
14603 unsigned char OperandFlags = X86II::MO_DTPOFF;
14604 unsigned WrapperKind = X86ISD::Wrapper;
14605 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14606 GA->getValueType(0),
14607 GA->getOffset(), OperandFlags);
14608 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14610 // Add x@dtpoff with the base.
14611 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
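// Roughly, on x86-64 the local-dynamic sequence built above corresponds to:
//   leaq  x@TLSLD(%rip), %rdi
//   callq __tls_get_addr@PLT     // TLSBASEADDR: base of this module's TLS
//   leaq  x@DTPOFF(%rax), %rax   // add the variable's offset to that base
// (illustrative only; the exact relocations/registers come from later passes).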
14614 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
14615 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14616 const EVT PtrVT, TLSModel::Model model,
14617 bool is64Bit, bool isPIC) {
14620 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
14621 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14622 is64Bit ? 257 : 256));
14624 SDValue ThreadPointer =
14625 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14626 MachinePointerInfo(Ptr));
14628 unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // the initial-exec model on 64-bit, where the @GOTTPOFF GOT load is
  // RIP-relative.
14631 unsigned WrapperKind = X86ISD::Wrapper;
14632 if (model == TLSModel::LocalExec) {
14633 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }
14645 // emit "addl x@ntpoff,%eax" (local exec)
14646 // or "addl x@indntpoff,%eax" (initial exec)
14647 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
14650 GA->getOffset(), OperandFlags);
14651 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14653 if (model == TLSModel::InitialExec) {
14654 if (isPIC && !is64Bit) {
14655 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
14656 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14660 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
14661 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14664 // The address of the thread local variable is the add of the thread
14665 // pointer with the offset of the variable.
14666 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
14670 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14672 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14674 if (DAG.getTarget().Options.EmulatedTLS)
14675 return LowerToTLSEmulatedModel(GA, DAG);
14677 const GlobalValue *GV = GA->getGlobal();
14678 auto PtrVT = getPointerTy(DAG.getDataLayout());
14679 bool PositionIndependent = isPositionIndependent();
14681 if (Subtarget.isTargetELF()) {
14682 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
14684 case TLSModel::GeneralDynamic:
14685 if (Subtarget.is64Bit())
14686 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14687 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14688 case TLSModel::LocalDynamic:
14689 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14690 Subtarget.is64Bit());
14691 case TLSModel::InitialExec:
14692 case TLSModel::LocalExec:
14693 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
14694 PositionIndependent);
14696 llvm_unreachable("Unknown TLS model.");
14699 if (Subtarget.isTargetDarwin()) {
14700 // Darwin only has one model of TLS. Lower to that.
14701 unsigned char OpFlag = 0;
14702 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14703 X86ISD::WrapperRIP : X86ISD::Wrapper;
14705 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14706 // global base reg.
14707 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
14709 OpFlag = X86II::MO_TLVP_PIC_BASE;
14711 OpFlag = X86II::MO_TLVP;
14713 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
14714 GA->getValueType(0),
14715 GA->getOffset(), OpFlag);
14716 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
14718 // With PIC32, the address is actually $g + Offset.
14720 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
14721 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
    // Lowering the machine ISD will make sure everything is in the right
    // place.
14726 SDValue Chain = DAG.getEntryNode();
14727 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14728 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
14729 SDValue Args[] = { Chain, Offset };
14730 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
14731 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
14732 DAG.getIntPtrConstant(0, DL, true),
14733 Chain.getValue(1), DL);
14735 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
14736 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14737 MFI.setAdjustsStack(true);
    // And our return value (tls address) is in the standard call return value
    // location.
14741 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
14742 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
14745 if (Subtarget.isTargetKnownWindowsMSVC() ||
14746 Subtarget.isTargetWindowsItanium() ||
14747 Subtarget.isTargetWindowsGNU()) {
14748 // Just use the implicit TLS architecture
14749 // Need to generate something similar to:
14750 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
14752 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
14753 // mov rcx, qword [rdx+rcx*8]
14754 // mov eax, .tls$:tlsvar
14755 // [rax+rcx] contains the address
14756 // Windows 64bit: gs:0x58
14757 // Windows 32bit: fs:__tls_array
14760 SDValue Chain = DAG.getEntryNode();
14762 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
14763 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
14764 // use its literal value of 0x2C.
14765 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
14766 ? Type::getInt8PtrTy(*DAG.getContext(),
14768 : Type::getInt32PtrTy(*DAG.getContext(),
14771 SDValue TlsArray = Subtarget.is64Bit()
14772 ? DAG.getIntPtrConstant(0x58, dl)
14773 : (Subtarget.isTargetWindowsGNU()
14774 ? DAG.getIntPtrConstant(0x2C, dl)
14775 : DAG.getExternalSymbol("_tls_array", PtrVT));
14777 SDValue ThreadPointer =
14778 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
14781 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
14782 res = ThreadPointer;
14784 // Load the _tls_index variable
14785 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
14786 if (Subtarget.is64Bit())
14787 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
14788 MachinePointerInfo(), MVT::i32);
14790 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
14792 auto &DL = DAG.getDataLayout();
14794 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
14795 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
14797 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
14800 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
    // Get the offset of the start of the .tls section.
14803 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14804 GA->getValueType(0),
14805 GA->getOffset(), X86II::MO_SECREL);
14806 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
14808 // The address of the thread local variable is the add of the thread
14809 // pointer with the offset of the variable.
14810 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
14813 llvm_unreachable("TLS not implemented for this target.");
14816 /// Lower SRA_PARTS and friends, which return two i32 values
14817 /// and take a 2 x i32 value to shift plus a shift amount.
14818 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
14819 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
14820 MVT VT = Op.getSimpleValueType();
14821 unsigned VTBits = VT.getSizeInBits();
14823 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
14824 SDValue ShOpLo = Op.getOperand(0);
14825 SDValue ShOpHi = Op.getOperand(1);
14826 SDValue ShAmt = Op.getOperand(2);
  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
  // generic ISD nodes haven't. Insert an AND to be safe; it's optimized away
  // anyway.
14830 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14831 DAG.getConstant(VTBits - 1, dl, MVT::i8));
14832 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
14833 DAG.getConstant(VTBits - 1, dl, MVT::i8))
14834 : DAG.getConstant(0, dl, VT);
14836 SDValue Tmp2, Tmp3;
14837 if (Op.getOpcode() == ISD::SHL_PARTS) {
14838 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
14839 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
14841 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
14842 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
  // If the shift amount is larger than or equal to the width of a part, we
  // can't rely on the results of shld/shrd. Insert a test and select the
  // appropriate values for large shift amounts.
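  // For instance, a 64-bit SHL_PARTS on a 32-bit target with ShAmt >= 32
  // cannot use shld: the correct result is Hi = Lo << (ShAmt - 32) and
  // Lo = 0, which is roughly why the CMOVs below pick Tmp3/Tmp1 instead of
  // Tmp2/Tmp3 when bit 5 of the shift amount is set.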
14848 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14849 DAG.getConstant(VTBits, dl, MVT::i8));
14850 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
14851 AndNode, DAG.getConstant(0, dl, MVT::i8));
14854 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
14855 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
14856 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
14858 if (Op.getOpcode() == ISD::SHL_PARTS) {
14859 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14860 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14862 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14863 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14866 SDValue Ops[2] = { Lo, Hi };
14867 return DAG.getMergeValues(Ops, dl);
14870 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
14871 SelectionDAG &DAG) const {
14872 SDValue Src = Op.getOperand(0);
14873 MVT SrcVT = Src.getSimpleValueType();
14874 MVT VT = Op.getSimpleValueType();
14877 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14878 if (SrcVT.isVector()) {
14879 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
14880 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
14881 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
14882 DAG.getUNDEF(SrcVT)));
14884 if (SrcVT.getVectorElementType() == MVT::i1) {
14885 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
14886 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14887 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
14888 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14889 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14890 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
14895 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
14896 "Unknown SINT_TO_FP to lower!");
  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget.is64Bit()) {
    return Op;
  }
14907 SDValue ValueToStore = Op.getOperand(0);
14908 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14909 !Subtarget.is64Bit())
14910 // Bitcasting to f64 here allows us to do a single 64-bit store from
14911 // an SSE register, avoiding the store forwarding penalty that would come
14912 // with two 32-bit stores.
14913 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
14915 unsigned Size = SrcVT.getSizeInBits()/8;
14916 MachineFunction &MF = DAG.getMachineFunction();
14917 auto PtrVT = getPointerTy(MF.getDataLayout());
14918 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
14919 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14920 SDValue Chain = DAG.getStore(
14921 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
14922 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
14923 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
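// For example, a scalar i64 -> f64 signed conversion without 64-bit GPRs is
// roughly: store the i64 to a stack slot, "fildll (slot)" to load it into an
// x87 register, then either keep the x87 value or store/reload it into an SSE
// register (see BuildFILD below). This is a sketch of the intent, not the
// exact code emitted for every subtarget.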
14926 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
14932 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  SDVTList Tys = useSSE ? DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue)
                        : DAG.getVTList(Op.getValueType(), MVT::Other);
14938 unsigned ByteSize = SrcVT.getSizeInBits()/8;
14940 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14941 MachineMemOperand *MMO;
14943 int SSFI = FI->getIndex();
14944 MMO = DAG.getMachineFunction().getMachineMemOperand(
14945 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14946 MachineMemOperand::MOLoad, ByteSize, ByteSize);
14948 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14949 StackSlot = StackSlot.getOperand(1);
14951 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14952 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14954 Tys, Ops, SrcVT, MMO);
14957 Chain = Result.getValue(1);
14958 SDValue InFlag = Result.getValue(2);
14960 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14961 // shouldn't be necessary except that RFP cannot be live across
14962 // multiple blocks. When stackifier is fixed, they can be uncoupled.
14963 MachineFunction &MF = DAG.getMachineFunction();
14964 unsigned SSFISize = Op.getValueSizeInBits()/8;
14965 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
14966 auto PtrVT = getPointerTy(MF.getDataLayout());
14967 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14968 Tys = DAG.getVTList(MVT::Other);
14970 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14972 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
14973 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14974 MachineMemOperand::MOStore, SSFISize, SSFISize);
14976 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14977 Ops, Op.getValueType(), MMO);
14978 Result = DAG.getLoad(
14979 Op.getValueType(), DL, Chain, StackSlot,
14980 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
14986 /// 64-bit unsigned integer to double expansion.
14987 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14988 SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
  //   movq       %rax,  %xmm0
  //   punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
  //   subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
  //   #ifdef __SSE3__
  //     haddpd   %xmm0, %xmm0
  //   #else
  //     pshufd   $0x4e, %xmm0, %xmm1
  //     addpd    %xmm1, %xmm0
  //   #endif
15003 LLVMContext *Context = DAG.getContext();
15005 // Build some magic constants.
15006 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15007 Constant *C0 = ConstantDataVector::get(*Context, CV0);
15008 auto PtrVT = getPointerTy(DAG.getDataLayout());
15009 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
15011 SmallVector<Constant*,2> CV1;
15013 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15014 APInt(64, 0x4330000000000000ULL))));
15016 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15017 APInt(64, 0x4530000000000000ULL))));
15018 Constant *C1 = ConstantVector::get(CV1);
15019 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
15021 // Load the 64-bit value into an XMM register.
15022 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
15025 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
15026 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15027 /* Alignment = */ 16);
15029 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
15032 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15033 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15034 /* Alignment = */ 16);
15035 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15036 // TODO: Are there any fast-math-flags to propagate here?
15037 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
15040 if (Subtarget.hasSSE3()) {
15041 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15042 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
15044 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15045 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
15046 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
15047 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
15050 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
15051 DAG.getIntPtrConstant(0, dl));
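// Why the magic constants above work: punpckldq places the low/high 32-bit
// halves of the input into the low mantissa bits of the doubles with bit
// patterns 0x4330... (= 0x1.0p52) and 0x4530... (= 0x1.0p52 * 0x1.0p32), so
// after the subpd the two lanes hold exactly lo and hi * 2^32; the horizontal
// add then reassembles the value with a single final rounding. E.g. input
// 0x0000000100000002 yields lanes 2.0 and 4294967296.0, i.e. 4294967298.0.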
15054 /// 32-bit unsigned integer to float expansion.
15055 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
15056 SelectionDAG &DAG) const {
15058 // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(
      BitsToDouble(0x4330000000000000ULL), dl, MVT::f64);
15062 // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));
15066 // Zero out the upper parts of the register.
15067 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
15069 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15070 DAG.getBitcast(MVT::v2f64, Load),
15071 DAG.getIntPtrConstant(0, dl));
15073 // Or the load with the bias.
15074 SDValue Or = DAG.getNode(
15075 ISD::OR, dl, MVT::v2i64,
15076 DAG.getBitcast(MVT::v2i64,
15077 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15078 DAG.getBitcast(MVT::v2i64,
15079 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15082 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15084 // Subtract the bias.
15085 // TODO: Are there any fast-math-flags to propagate here?
15086 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15088 // Handle final rounding.
15089 MVT DestVT = Op.getSimpleValueType();
15091 if (DestVT.bitsLT(MVT::f64))
15092 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
15093 DAG.getIntPtrConstant(0, dl));
15094 if (DestVT.bitsGT(MVT::f64))
15095 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
  // Handle final rounding.
  return Sub;
15101 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15102 const X86Subtarget &Subtarget, SDLoc &DL) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();
15106 SDValue N0 = Op.getOperand(0);
15107 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15109 // Legalize to v4i32 type.
15110 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15111 DAG.getUNDEF(MVT::v2i32));
15113 if (Subtarget.hasAVX512())
15114 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15116 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15117 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
15118 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
15119 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
15121 // Two to the power of half-word-size.
15122 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
15124 // Clear upper part of LO, lower HI.
15125 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
15126 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
15128 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
15129 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
15130 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
15132 // Add the two halves.
15133 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
15136 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
15137 const X86Subtarget &Subtarget) {
15138 // The algorithm is the following:
15139 // #ifdef __SSE4_1__
15140 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15141 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15142 // (uint4) 0x53000000, 0xaa);
15144 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15145 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15147 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15148 // return (float4) lo + fhi;
15150 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
15151 // reassociate the two FADDs, and if we do that, the algorithm fails
15152 // spectacularly (PR24512).
15153 // FIXME: If we ever have some kind of Machine FMF, this should be marked
15154 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
15155 // there's also the MachineCombiner reassociations happening on Machine IR.
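  // Worked example of the bit trick (with exact arithmetic): 0x4b000000 is
  // 0x1.0p23f and 0x53000000 is 0x1.0p39f, so "lo" holds 2^23 + (v & 0xffff)
  // and "hi" holds 2^39 + (v >> 16) * 2^16 exactly. Subtracting
  // (0x1.0p39f + 0x1.0p23f) from hi and then adding lo cancels both biases and
  // leaves (v >> 16) * 2^16 + (v & 0xffff) == v, with a single rounding in the
  // final FADD.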
  if (DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();
15160 SDValue V = Op->getOperand(0);
15161 MVT VecIntVT = V.getSimpleValueType();
15162 bool Is128 = VecIntVT == MVT::v4i32;
15163 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something other than the supported type, e.g., to v4f64,
  // bail out early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();
15169 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
15170 "Unsupported custom type");
  // In the #ifdef/#else code, we have in common:
15173 // - The vector of constants:
15179 // Create the splat vector for 0x4b000000.
15180 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
15181 // Create the splat vector for 0x53000000.
15182 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
15184 // Create the right shift.
15185 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
15186 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
15190 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
15191 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15192 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
15193 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
    // Low will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
15196 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
15197 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15198 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15199 // (uint4) 0x53000000, 0xaa);
15200 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
15201 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
15202 // High will be bitcasted right away, so do not bother bitcasting back to
15203 // its original type.
15204 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
15205 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15207 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
15208 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15209 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
15210 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
15212 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15213 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
15216 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
15217 SDValue VecCstFAdd = DAG.getConstantFP(
15218 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
15220 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15221 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
15222 // TODO: Are there any fast-math-flags to propagate here?
15224 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
15225 // return (float4) lo + fhi;
15226 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
15227 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
15230 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
15231 SelectionDAG &DAG) const {
15232 SDValue N0 = Op.getOperand(0);
15233 MVT SrcVT = N0.getSimpleValueType();
15236 if (SrcVT.getVectorElementType() == MVT::i1) {
15237 if (SrcVT == MVT::v2i1)
15238 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15239 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
15240 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15241 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15242 DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
15245 switch (SrcVT.SimpleTy) {
15247 llvm_unreachable("Custom UINT_TO_FP is not supported!");
15252 MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15253 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15254 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
15257 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
15260 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
15263 assert(Subtarget.hasAVX512());
15264 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15265 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
15269 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
15270 SelectionDAG &DAG) const {
15271 SDValue N0 = Op.getOperand(0);
15273 auto PtrVT = getPointerTy(DAG.getDataLayout());
15275 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
15276 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
15277 // the optimization here.
15278 if (DAG.SignBitIsZero(N0))
15279 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
15281 if (Op.getSimpleValueType().isVector())
15282 return lowerUINT_TO_FP_vec(Op, DAG);
15284 MVT SrcVT = N0.getSimpleValueType();
15285 MVT DstVT = Op.getSimpleValueType();
15287 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
15288 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
15289 // Conversions from unsigned i32 to f32/f64 are legal,
15290 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
15294 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
15295 return LowerUINT_TO_FP_i64(Op, DAG);
15296 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
15297 return LowerUINT_TO_FP_i32(Op, DAG);
15298 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
15301 // Make a 64-bit buffer, and use it to build an FILD.
15302 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
15303 if (SrcVT == MVT::i32) {
15304 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
15305 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
15306 StackSlot, MachinePointerInfo());
15307 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15308 OffsetSlot, MachinePointerInfo());
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
15313 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15314 SDValue ValueToStore = Op.getOperand(0);
15315 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15316 // Bitcasting to f64 here allows us to do a single 64-bit store from
15317 // an SSE register, avoiding the store forwarding penalty that would come
15318 // with two 32-bit stores.
15319 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15320 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15321 MachinePointerInfo());
15322 // For i64 source, we need to add the appropriate power of 2 if the input
15323 // was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
15325 // we must be careful to do the computation in x87 extended precision, not
15326 // in SSE. (The generic code can't know it's OK to do this, or how to.)
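  // Sketch of the fixup: if the i64 input is negative, the FILD interprets it
  // as value - 2^64, so we conditionally add 2^64 back. 0x5F800000 is 2^64 as
  // an IEEE single, and the constant-pool pair built below lets a single load
  // pick either that fudge value or 0.0 depending on the sign bit.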
15327 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15328 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15329 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15330 MachineMemOperand::MOLoad, 8, 8);
15332 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15333 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
                                         MVT::i64, MMO);
15337 APInt FF(32, 0x5F800000ULL);
15339 // Check whether the sign bit is set.
15340 SDValue SignSet = DAG.getSetCC(
15341 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15342 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15344 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
15345 SDValue FudgePtr = DAG.getConstantPool(
15346 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15348 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15349 SDValue Zero = DAG.getIntPtrConstant(0, dl);
15350 SDValue Four = DAG.getIntPtrConstant(4, dl);
15351 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
15353 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
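// The 64-bit constant-pool entry is stored little-endian, so offset 0 reads the
// f32 bit pattern for 2^64 (selected when the input was negative) and offset 4
// reads 0.0f (selected when the input was non-negative).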
15355 // Load the value out, extending it from f32 to f80.
15356 // FIXME: Avoid the extend by constructing the right constant pool?
15357 SDValue Fudge = DAG.getExtLoad(
15358 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15359 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15360 /* Alignment = */ 4);
15361 // Extend everything to 80 bits to force it to be done on x87.
15362 // TODO: Are there any fast-math-flags to propagate here?
15363 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15364 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15365 DAG.getIntPtrConstant(0, dl));
15368 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15369 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15370 // just return an <SDValue(), SDValue()> pair.
15371 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15372 // to i16, i32 or i64, and we lower it to a legal sequence.
15373 // If lowered to the final integer result we return a <result, SDValue()> pair.
15374 // Otherwise we lower it to a sequence ending with a FIST, return a
15375 // <FIST, StackSlot> pair, and the caller is responsible for loading
15376 // the final integer result from StackSlot.
15377 std::pair<SDValue,SDValue>
15378 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
15379 bool IsSigned, bool IsReplace) const {
15382 EVT DstTy = Op.getValueType();
15383 EVT TheVT = Op.getOperand(0).getValueType();
15384 auto PtrVT = getPointerTy(DAG.getDataLayout());
15386 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15387 // f16 must be promoted before using the lowering in this routine.
15388 // fp128 does not use this lowering.
15389 return std::make_pair(SDValue(), SDValue());
15392 // If using FIST to compute an unsigned i64, we'll need some fixup
15393 // to handle values above the maximum signed i64. A FIST is always
15394 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15395 bool UnsignedFixup = !IsSigned &&
15396 DstTy == MVT::i64 &&
15397 (!Subtarget.is64Bit() ||
15398 !isScalarFPTypeInSSEReg(TheVT));
15400 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15401 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
15402 // The low 32 bits of the fist result will have the correct uint32 result.
15403 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15407 assert(DstTy.getSimpleVT() <= MVT::i64 &&
15408 DstTy.getSimpleVT() >= MVT::i16 &&
15409 "Unknown FP_TO_INT to lower!");
15411 // These are really Legal.
15412 if (DstTy == MVT::i32 &&
15413 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15414 return std::make_pair(SDValue(), SDValue());
15415 if (Subtarget.is64Bit() &&
15416 DstTy == MVT::i64 &&
15417 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15418 return std::make_pair(SDValue(), SDValue());
15420 // We lower FP->int64 into FISTP64 followed by a load from a temporary stack slot.
15422 MachineFunction &MF = DAG.getMachineFunction();
15423 unsigned MemSize = DstTy.getSizeInBits()/8;
15424 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15425 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15428 switch (DstTy.getSimpleVT().SimpleTy) {
15429 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15430 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15431 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15432 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15435 SDValue Chain = DAG.getEntryNode();
15436 SDValue Value = Op.getOperand(0);
15437 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15439 if (UnsignedFixup) {
15441 // Conversion to unsigned i64 is implemented with a select,
15442 // depending on whether the source value fits in the range
15443 // of a signed i64. Let Thresh be the FP equivalent of
15444 // 0x8000000000000000ULL.
15446 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15447 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
15448 // Fist-to-mem64 FistSrc
15449 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15450 // to XOR'ing the high 32 bits with Adjust.
15452 // Being a power of 2, Thresh is exactly representable in all FP formats.
15453 // For X87 we'd like to use the smallest FP type for this constant, but
15454 // for DAG type consistency we have to match the FP operand type.
15456 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
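// 0x5f000000 is 2^63 as an IEEE-754 single, i.e. the FP value of the
// 0x8000000000000000ULL threshold mentioned above.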
15457 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15458 bool LosesInfo = false;
15459 if (TheVT == MVT::f64)
15460 // The rounding mode is irrelevant as the conversion should be exact.
15461 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
15463 else if (TheVT == MVT::f80)
15464 Status = Thresh.convert(APFloat::x87DoubleExtended(),
15465 APFloat::rmNearestTiesToEven, &LosesInfo);
15467 assert(Status == APFloat::opOK && !LosesInfo &&
15468 "FP conversion should have been exact");
15470 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
15472 SDValue Cmp = DAG.getSetCC(DL,
15473 getSetCCResultType(DAG.getDataLayout(),
15474 *DAG.getContext(), TheVT),
15475 Value, ThreshVal, ISD::SETLT);
15476 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
15477 DAG.getConstant(0, DL, MVT::i32),
15478 DAG.getConstant(0x80000000, DL, MVT::i32));
15479 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
15480 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
15481 *DAG.getContext(), TheVT),
15482 Value, ThreshVal, ISD::SETLT);
15483 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
15486 // FIXME: This causes a redundant load/store if the SSE-class value is already
15487 // in memory, such as if it is on the call stack.
15488 if (isScalarFPTypeInSSEReg(TheVT)) {
15489 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15490 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
15491 MachinePointerInfo::getFixedStack(MF, SSFI));
15492 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
15494 Chain, StackSlot, DAG.getValueType(TheVT)
15497 MachineMemOperand *MMO =
15498 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15499 MachineMemOperand::MOLoad, MemSize, MemSize);
15500 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
15501 Chain = Value.getValue(1);
15502 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15503 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15506 MachineMemOperand *MMO =
15507 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15508 MachineMemOperand::MOStore, MemSize, MemSize);
15510 if (UnsignedFixup) {
15512 // Insert the FIST, load its result as two i32's,
15513 // and XOR the high i32 with Adjust.
15515 SDValue FistOps[] = { Chain, Value, StackSlot };
15516 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15517 FistOps, DstTy, MMO);
15520 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15521 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15524 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15525 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15527 if (Subtarget.is64Bit()) {
15528 // Join High32 and Low32 into a 64-bit result.
15529 // (High32 << 32) | Low32
15530 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15531 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15532 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15533 DAG.getConstant(32, DL, MVT::i8));
15534 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15535 return std::make_pair(Result, SDValue());
15538 SDValue ResultOps[] = { Low32, High32 };
15540 SDValue pair = IsReplace
15541 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15542 : DAG.getMergeValues(ResultOps, DL);
15543 return std::make_pair(pair, SDValue());
15545 // Build the FP_TO_INT*_IN_MEM
15546 SDValue Ops[] = { Chain, Value, StackSlot };
15547 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15549 return std::make_pair(FIST, StackSlot);
15553 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15554 const X86Subtarget &Subtarget) {
15555 MVT VT = Op->getSimpleValueType(0);
15556 SDValue In = Op->getOperand(0);
15557 MVT InVT = In.getSimpleValueType();
15560 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15561 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15563 // Optimize vectors in AVX mode:
15566 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
15567 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
15568 // Concat upper and lower parts.
15571 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
15572 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
15573 // Concat upper and lower parts.
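// Interleaving the source with an all-zeros vector (via unpcklo/unpckhi) places
// a zero above every element, which is exactly a zero-extension of that element.
// For ANY_EXTEND an undef vector is interleaved instead, since the high bits
// do not matter.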
15576 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15577 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15578 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15581 if (Subtarget.hasInt256())
15582 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15584 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15585 SDValue Undef = DAG.getUNDEF(InVT);
15586 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15587 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15588 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15590 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15591 VT.getVectorNumElements()/2);
15593 OpLo = DAG.getBitcast(HVT, OpLo);
15594 OpHi = DAG.getBitcast(HVT, OpHi);
15596 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15599 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15600 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15601 MVT VT = Op->getSimpleValueType(0);
15602 SDValue In = Op->getOperand(0);
15603 MVT InVT = In.getSimpleValueType();
15605 unsigned NumElts = VT.getVectorNumElements();
15607 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
15608 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
15609 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15611 if (InVT.getVectorElementType() != MVT::i1)
15614 // Extend VT if the result is a 256-bit or 128-bit vector and VLX is not supported.
15616 if (!VT.is512BitVector() && !Subtarget.hasVLX())
15617 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15620 DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15622 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15624 SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
15626 return SelectedVal;
15627 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15630 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15631 SelectionDAG &DAG) {
15632 if (Subtarget.hasFp256())
15633 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15639 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15640 SelectionDAG &DAG) {
15642 MVT VT = Op.getSimpleValueType();
15643 SDValue In = Op.getOperand(0);
15644 MVT SVT = In.getSimpleValueType();
15646 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15647 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15649 if (Subtarget.hasFp256())
15650 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15653 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15654 VT.getVectorNumElements() != SVT.getVectorNumElements());
15658 /// Helper to recursively truncate vector elements in half with PACKSS.
15659 /// It makes use of the fact that vector comparison results will be all-zeros
15660 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15661 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15662 /// within each 128-bit lane.
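/// Signed saturation maps 0 to 0 and -1 to -1, so packing the halves of an
/// all-zeros/all-ones vector loses no information.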
15663 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15666 const X86Subtarget &Subtarget) {
15667 // Requires SSE2 but AVX512 has fast truncate.
15668 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15671 EVT SrcVT = In.getValueType();
15673 // No truncation required, we might get here due to recursive calls.
15674 if (SrcVT == DstVT)
15677 // We only support vector truncation to 128 bits or greater from a
15678 // source of 256 bits or greater.
15679 if ((DstVT.getSizeInBits() % 128) != 0)
15681 if ((SrcVT.getSizeInBits() % 256) != 0)
15684 unsigned NumElems = SrcVT.getVectorNumElements();
15685 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15686 assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15689 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15691 // Extract lower/upper subvectors.
15692 unsigned NumSubElts = NumElems / 2;
15693 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15694 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15695 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15697 // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15698 if (SrcVT.is256BitVector()) {
15699 Lo = DAG.getBitcast(MVT::v8i16, Lo);
15700 Hi = DAG.getBitcast(MVT::v8i16, Hi);
15701 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15702 return DAG.getBitcast(DstVT, Res);
15705 // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15706 // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15707 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15708 Lo = DAG.getBitcast(MVT::v16i16, Lo);
15709 Hi = DAG.getBitcast(MVT::v16i16, Hi);
15710 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15712 // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15713 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15714 Res = DAG.getBitcast(MVT::v4i64, Res);
15715 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15717 if (DstVT.is256BitVector())
15718 return DAG.getBitcast(DstVT, Res);
15720 // If 512bit -> 128bit truncate another stage.
15721 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15722 Res = DAG.getBitcast(PackedVT, Res);
15723 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15726 // Recursively pack lower/upper subvectors, concat result and pack again.
15727 assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15728 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15729 Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15730 Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15732 PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15733 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
15734 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15737 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
15738 const X86Subtarget &Subtarget) {
15741 MVT VT = Op.getSimpleValueType();
15742 SDValue In = Op.getOperand(0);
15743 MVT InVT = In.getSimpleValueType();
15745 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15747 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
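// After the shift each element is either 0 or has only its sign bit set, so
// VPMOVB2M/VPMOVW2M (which read the sign bit of each element) and TESTM (which
// tests each element for non-zero) both recover the original LSB.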
15748 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
15749 if (InVT.getScalarSizeInBits() <= 16) {
15750 if (Subtarget.hasBWI()) {
15751 // Legal; this will be selected to VPMOVB2M or VPMOVW2M.
15752 // Shifting packed bytes is not supported natively, so bitcast to words.
15753 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
15754 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
15755 DAG.getBitcast(ExtVT, In),
15756 DAG.getConstant(ShiftInx, DL, ExtVT));
15757 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
15758 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
15760 // Use TESTD/Q after extending the vector to packed dwords/qwords.
15761 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15762 "Unexpected vector type.");
15763 unsigned NumElts = InVT.getVectorNumElements();
15764 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15765 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
15767 ShiftInx = InVT.getScalarSizeInBits() - 1;
15770 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
15771 DAG.getConstant(ShiftInx, DL, InVT));
15772 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
15775 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
15777 MVT VT = Op.getSimpleValueType();
15778 SDValue In = Op.getOperand(0);
15779 MVT InVT = In.getSimpleValueType();
15781 if (VT == MVT::i1) {
15782 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15783 "Invalid scalar TRUNCATE operation");
15784 if (InVT.getSizeInBits() >= 32)
15786 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
15787 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
15789 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15790 "Invalid TRUNCATE operation");
15792 if (VT.getVectorElementType() == MVT::i1)
15793 return LowerTruncateVecI1(Op, DAG, Subtarget);
15795 // vpmovqb/w/d, vpmovdb/w, vpmovwb
15796 if (Subtarget.hasAVX512()) {
15797 // Word-to-byte truncation is only supported directly under BWI; otherwise widen to dwords first.
15798 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
15799 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
15800 getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
15801 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
15804 // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
15805 if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
15806 if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
15809 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
15810 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
15811 if (Subtarget.hasInt256()) {
15812 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
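// Viewed as v8i32, lanes 0, 2, 4 and 6 hold the low 32 bits of the four 64-bit
// elements, so one cross-lane dword shuffle (VPERMD) gathers the truncated
// values into the low half of the register.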
15813 In = DAG.getBitcast(MVT::v8i32, In);
15814 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
15815 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
15816 DAG.getIntPtrConstant(0, DL));
15819 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15820 DAG.getIntPtrConstant(0, DL));
15821 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15822 DAG.getIntPtrConstant(2, DL));
15823 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15824 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15825 static const int ShufMask[] = {0, 2, 4, 6};
15826 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
15829 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
15830 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
15831 if (Subtarget.hasInt256()) {
15832 In = DAG.getBitcast(MVT::v32i8, In);
15834 // The PSHUFB mask:
15835 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
15836 -1, -1, -1, -1, -1, -1, -1, -1,
15837 16, 17, 20, 21, 24, 25, 28, 29,
15838 -1, -1, -1, -1, -1, -1, -1, -1 };
15839 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
15840 In = DAG.getBitcast(MVT::v4i64, In);
15842 static const int ShufMask2[] = {0, 2, -1, -1};
15843 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
15844 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15845 DAG.getIntPtrConstant(0, DL));
15846 return DAG.getBitcast(VT, In);
15849 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15850 DAG.getIntPtrConstant(0, DL));
15852 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15853 DAG.getIntPtrConstant(4, DL));
15855 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
15856 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
15858 // The PSHUFB mask:
15859 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
15860 -1, -1, -1, -1, -1, -1, -1, -1};
15862 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
15863 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
15865 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15866 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15868 // The MOVLHPS Mask:
15869 static const int ShufMask2[] = {0, 1, 4, 5};
15870 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
15871 return DAG.getBitcast(MVT::v8i16, res);
15874 // Handle truncation of V256 to V128 using shuffles.
15875 if (!VT.is128BitVector() || !InVT.is256BitVector())
15878 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
15880 unsigned NumElems = VT.getVectorNumElements();
15881 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
15883 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
15884 // Prepare truncation shuffle mask
15885 for (unsigned i = 0; i != NumElems; ++i)
15886 MaskVec[i] = i * 2;
15887 In = DAG.getBitcast(NVT, In);
15888 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
15889 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
15890 DAG.getIntPtrConstant(0, DL));
15893 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
15894 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
15895 MVT VT = Op.getSimpleValueType();
15897 if (VT.isVector()) {
15898 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
15899 SDValue Src = Op.getOperand(0);
15901 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
15902 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
15903 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
15904 DAG.getUNDEF(MVT::v2f32)));
15910 assert(!VT.isVector());
15912 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
15913 IsSigned, /*IsReplace=*/ false);
15914 SDValue FIST = Vals.first, StackSlot = Vals.second;
15915 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
15916 if (!FIST.getNode())
15919 if (StackSlot.getNode())
15920 // Load the result.
15921 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
15923 // The node is the result.
15927 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
15929 MVT VT = Op.getSimpleValueType();
15930 SDValue In = Op.getOperand(0);
15931 MVT SVT = In.getSimpleValueType();
15933 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
15935 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
15936 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
15937 In, DAG.getUNDEF(SVT)));
15940 /// The only differences between FABS and FNEG are the mask and the logic op.
15941 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
15942 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
15943 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
15944 "Wrong opcode for lowering FABS or FNEG.");
15946 bool IsFABS = (Op.getOpcode() == ISD::FABS);
15948 // If this is a FABS and it has an FNEG user, bail out to fold the combination
15949 // into an FNABS. We'll lower the FABS after that if it is still in use.
15951 for (SDNode *User : Op->uses())
15952 if (User->getOpcode() == ISD::FNEG)
15956 MVT VT = Op.getSimpleValueType();
15958 bool IsF128 = (VT == MVT::f128);
15960 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
15961 // decide if we should generate a 16-byte constant mask when we only need 4 or
15962 // 8 bytes for the scalar case.
15967 if (VT.isVector()) {
15969 EltVT = VT.getVectorElementType();
15970 } else if (IsF128) {
15971 // SSE instructions are used for optimized f128 logical operations.
15972 LogicVT = MVT::f128;
15975 // There are no scalar bitwise logical SSE/AVX instructions, so we
15976 // generate a 16-byte vector constant and logic op even for the scalar case.
15977 // Using a 16-byte mask allows folding the load of the mask with
15978 // the logic op, so it can save (~4 bytes) on code size.
15979 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
15983 unsigned EltBits = EltVT.getSizeInBits();
15984 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
15986 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
15987 const fltSemantics &Sem =
15988 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
15989 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
15990 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
15992 SDValue Op0 = Op.getOperand(0);
15993 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
15995 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
15996 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
15998 if (VT.isVector() || IsF128)
15999 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16001 // For the scalar case extend to a 128-bit vector, perform the logic op,
16002 // and extract the scalar result back out.
16003 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16004 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16005 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16006 DAG.getIntPtrConstant(0, dl));
16009 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16010 SDValue Mag = Op.getOperand(0);
16011 SDValue Sign = Op.getOperand(1);
16014 // If the sign operand is smaller, extend it first.
16015 MVT VT = Op.getSimpleValueType();
16016 if (Sign.getSimpleValueType().bitsLT(VT))
16017 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16019 // And if it is bigger, shrink it first.
16020 if (Sign.getSimpleValueType().bitsGT(VT))
16021 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16023 // At this point the operands and the result should have the same
16024 // type, and that won't be f80 since that is not custom lowered.
16025 bool IsF128 = (VT == MVT::f128);
16026 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16027 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16028 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16029 "Unexpected type in LowerFCOPYSIGN");
16031 MVT EltVT = VT.getScalarType();
16032 const fltSemantics &Sem =
16033 EltVT == MVT::f64 ? APFloat::IEEEdouble()
16034 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16036 // Perform all scalar logic operations as 16-byte vectors because there are no
16037 // scalar FP logic instructions in SSE.
16038 // TODO: This isn't necessary. If we used scalar types, we might avoid some
16039 // unnecessary splats, but we might miss load folding opportunities. Should
16040 // this decision be based on OptimizeForSize?
16041 bool IsFakeVector = !VT.isVector() && !IsF128;
16044 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16046 // The mask constants are automatically splatted for vector types.
16047 unsigned EltSizeInBits = VT.getScalarSizeInBits();
16048 SDValue SignMask = DAG.getConstantFP(
16049 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16050 SDValue MagMask = DAG.getConstantFP(
16051 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
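// The lowering below implements copysign(Mag, Sign) =
//   (Mag & MagMask) | (Sign & SignMask).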
16053 // First, clear all bits but the sign bit from the second operand (sign).
16055 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16056 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16058 // Next, clear the sign bit from the first operand (magnitude).
16059 // TODO: If we had general constant folding for FP logic ops, this check
16060 // wouldn't be necessary.
16062 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16063 APFloat APF = Op0CN->getValueAPF();
16065 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
16067 // If the magnitude operand wasn't a constant, we need to AND out the sign.
16069 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
16070 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
16073 // OR the magnitude value with the sign bit.
16074 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16075 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
16076 DAG.getIntPtrConstant(0, dl));
16079 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
16080 SDValue N0 = Op.getOperand(0);
16082 MVT VT = Op.getSimpleValueType();
16084 MVT OpVT = N0.getSimpleValueType();
16085 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16086 "Unexpected type for FGETSIGN");
16088 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
16089 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16090 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16091 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16092 Res = DAG.getZExtOrTrunc(Res, dl, VT);
16093 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
16097 // Check whether an OR'd tree is PTEST-able.
16098 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16099 SelectionDAG &DAG) {
16100 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
16102 if (!Subtarget.hasSSE41())
16105 if (!Op->hasOneUse())
16108 SDNode *N = Op.getNode();
16111 SmallVector<SDValue, 8> Opnds;
16112 DenseMap<SDValue, unsigned> VecInMap;
16113 SmallVector<SDValue, 8> VecIns;
16114 EVT VT = MVT::Other;
16116 // Recognize a special case where a vector is cast into a wide integer to test for all zeros.
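// A test like (i128 bitcast of a v4i32) == 0 can reach here as an OR tree of
// the four extracted elements; matching that tree lets us emit a single PTEST
// of the vector against itself instead of scalar ORs and a compare.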
16118 Opnds.push_back(N->getOperand(0));
16119 Opnds.push_back(N->getOperand(1));
16121 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16122 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16123 // BFS traverse all OR'd operands.
16124 if (I->getOpcode() == ISD::OR) {
16125 Opnds.push_back(I->getOperand(0));
16126 Opnds.push_back(I->getOperand(1));
16127 // Re-evaluate the number of nodes to be traversed.
16128 e += 2; // 2 more nodes (LHS and RHS) are pushed.
16132 // Quit if this is not an EXTRACT_VECTOR_ELT.
16133 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16136 // Quit if the index is not a constant.
16137 SDValue Idx = I->getOperand(1);
16138 if (!isa<ConstantSDNode>(Idx))
16141 SDValue ExtractedFromVec = I->getOperand(0);
16142 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16143 if (M == VecInMap.end()) {
16144 VT = ExtractedFromVec.getValueType();
16145 // Quit if not 128/256-bit vector.
16146 if (!VT.is128BitVector() && !VT.is256BitVector())
16148 // Quit if not the same type.
16149 if (VecInMap.begin() != VecInMap.end() &&
16150 VT != VecInMap.begin()->first.getValueType())
16152 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16153 VecIns.push_back(ExtractedFromVec);
16155 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
16158 assert((VT.is128BitVector() || VT.is256BitVector()) &&
16159 "Not extracted from 128-/256-bit vector.");
16161 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
16163 for (DenseMap<SDValue, unsigned>::const_iterator
16164 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16165 // Quit if not all elements are used.
16166 if (I->second != FullMask)
16170 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
16172 // Cast all vectors into TestVT for PTEST.
16173 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16174 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16176 // If more than one full vector is evaluated, OR them first before PTEST.
16177 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16178 // Each iteration will OR 2 nodes and append the result until there is only
16179 // 1 node left, i.e. the final OR'd value of all vectors.
16180 SDValue LHS = VecIns[Slot];
16181 SDValue RHS = VecIns[Slot + 1];
16182 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
16185 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
16188 /// \brief Return true if \c Op has a use that doesn't just read flags.
16189 static bool hasNonFlagsUse(SDValue Op) {
16190 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
16192 SDNode *User = *UI;
16193 unsigned UOpNo = UI.getOperandNo();
16194 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
16195 // Look past the truncate.
16196 UOpNo = User->use_begin().getOperandNo();
16197 User = *User->use_begin();
16200 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
16201 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
16207 // Emit KTEST instruction for bit vectors on AVX-512
16208 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
16209 const X86Subtarget &Subtarget) {
16210 if (Op.getOpcode() == ISD::BITCAST) {
16211 auto hasKTEST = [&](MVT VT) {
16212 unsigned SizeInBits = VT.getSizeInBits();
16213 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
16214 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
16216 SDValue Op0 = Op.getOperand(0);
16217 MVT Op0VT = Op0.getValueType().getSimpleVT();
16218 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
16220 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
16225 /// Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
16227 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
16228 SelectionDAG &DAG) const {
16229 if (Op.getValueType() == MVT::i1) {
16230 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
16231 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
16232 DAG.getConstant(0, dl, MVT::i8));
16234 // CF and OF aren't always set the way we want. Determine which
16235 // of these we need.
16236 bool NeedCF = false;
16237 bool NeedOF = false;
16240 case X86::COND_A: case X86::COND_AE:
16241 case X86::COND_B: case X86::COND_BE:
16244 case X86::COND_G: case X86::COND_GE:
16245 case X86::COND_L: case X86::COND_LE:
16246 case X86::COND_O: case X86::COND_NO: {
16247 // Check if we really need to set the
16248 // Overflow flag. If NoSignedWrap is present,
16249 // that flag is not actually needed.
16250 switch (Op->getOpcode()) {
16255 if (Op.getNode()->getFlags().hasNoSignedWrap())
16264 // See if we can use the EFLAGS value from the operand instead of
16265 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
16266 // we prove that the arithmetic won't overflow, we can't use OF or CF.
16267 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
16268 // Emit KTEST for bit vectors
16269 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16271 // Emit a CMP with 0, which is the TEST pattern.
16272 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16273 DAG.getConstant(0, dl, Op.getValueType()));
16275 unsigned Opcode = 0;
16276 unsigned NumOperands = 0;
16278 // Truncate operations may prevent the merge of the SETCC instruction
16279 // and the arithmetic instruction before it. Attempt to truncate the operands
16280 // of the arithmetic instruction and use a reduced bit-width instruction.
16281 bool NeedTruncation = false;
16282 SDValue ArithOp = Op;
16283 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16284 SDValue Arith = Op->getOperand(0);
16285 // Both the trunc and the arithmetic op need to have one user each.
16286 if (Arith->hasOneUse())
16287 switch (Arith.getOpcode()) {
16294 NeedTruncation = true;
16300 // Sometimes flags can be set either with an AND or with an SRL/SHL
16301 // instruction. The SRL/SHL variant should be preferred for masks longer than this number of bits.
16303 const int ShiftToAndMaxMaskWidth = 32;
16304 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
16306 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16307 // which may be the result of a CAST. We use the variable 'Op', which is the
16308 // non-casted variable when we check for possible users.
16309 switch (ArithOp.getOpcode()) {
16311 // Due to an isel shortcoming, be conservative if this add is likely to be
16312 // selected as part of a load-modify-store instruction. When the root node
16313 // in a match is a store, isel doesn't know how to remap non-chain non-flag
16314 // uses of other nodes in the match, such as the ADD in this case. This
16315 // leads to the ADD being left around and reselected, with the result being
16316 // two adds in the output. Alas, even if none of our users are stores, that
16317 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
16318 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
16319 // climbing the DAG back to the root, and it doesn't seem to be worth the effort.
16321 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16322 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16323 if (UI->getOpcode() != ISD::CopyToReg &&
16324 UI->getOpcode() != ISD::SETCC &&
16325 UI->getOpcode() != ISD::STORE)
16328 if (ConstantSDNode *C =
16329 dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16330 // An add of one will be selected as an INC.
16331 if (C->isOne() && !Subtarget.slowIncDec()) {
16332 Opcode = X86ISD::INC;
16337 // An add of negative one (subtract of one) will be selected as a DEC.
16338 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
16339 Opcode = X86ISD::DEC;
16345 // Otherwise use a regular EFLAGS-setting add.
16346 Opcode = X86ISD::ADD;
16351 // If we have a constant logical shift that's only used in a comparison
16353 // against zero, turn it into an equivalent AND. This allows turning it into
16353 // a TEST instruction later.
16354 if (ZeroCheck && Op->hasOneUse() &&
16355 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16356 EVT VT = Op.getValueType();
16357 unsigned BitWidth = VT.getSizeInBits();
16358 unsigned ShAmt = Op->getConstantOperandVal(1);
16359 if (ShAmt >= BitWidth) // Avoid undefined shifts.
16361 APInt Mask = ArithOp.getOpcode() == ISD::SRL
16362 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16363 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
16364 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16366 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
16367 DAG.getConstant(Mask, dl, VT));
16372 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16373 // because a TEST instruction will be better. However, AND should be
16374 // preferred if the instruction can be combined into ANDN.
16375 if (!hasNonFlagsUse(Op)) {
16376 SDValue Op0 = ArithOp->getOperand(0);
16377 SDValue Op1 = ArithOp->getOperand(1);
16378 EVT VT = ArithOp.getValueType();
16379 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16380 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16381 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16383 // If we cannot select an ANDN instruction, check if we can replace
16384 // AND+IMM64 with a shift before giving up. This is possible for masks
16385 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
16386 if (!isProperAndn) {
16390 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16391 auto *CN = dyn_cast<ConstantSDNode>(Op1);
16395 const APInt &Mask = CN->getAPIntValue();
16396 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16397 break; // Prefer TEST instruction.
16399 unsigned BitWidth = Mask.getBitWidth();
16400 unsigned LeadingOnes = Mask.countLeadingOnes();
16401 unsigned TrailingZeros = Mask.countTrailingZeros();
16403 if (LeadingOnes + TrailingZeros == BitWidth) {
16404 assert(TrailingZeros < VT.getSizeInBits() &&
16405 "Shift amount should be less than the type width");
16406 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16407 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
16408 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
16412 unsigned LeadingZeros = Mask.countLeadingZeros();
16413 unsigned TrailingOnes = Mask.countTrailingOnes();
16415 if (LeadingZeros + TrailingOnes == BitWidth) {
16416 assert(LeadingZeros < VT.getSizeInBits() &&
16417 "Shift amount should be less than the type width");
16418 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16419 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
16420 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
16431 // Due to the ISEL shortcoming noted above, be conservative if this op is
16432 // likely to be selected as part of a load-modify-store instruction.
16433 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16434 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16435 if (UI->getOpcode() == ISD::STORE)
16438 // Otherwise use a regular EFLAGS-setting instruction.
16439 switch (ArithOp.getOpcode()) {
16440 default: llvm_unreachable("unexpected operator!");
16441 case ISD::SUB: Opcode = X86ISD::SUB; break;
16442 case ISD::XOR: Opcode = X86ISD::XOR; break;
16443 case ISD::AND: Opcode = X86ISD::AND; break;
16445 if (!NeedTruncation && ZeroCheck) {
16446 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
16449 Opcode = X86ISD::OR;
16463 return SDValue(Op.getNode(), 1);
16469 // If we found that truncation is beneficial, perform the truncation and
16471 if (NeedTruncation) {
16472 EVT VT = Op.getValueType();
16473 SDValue WideVal = Op->getOperand(0);
16474 EVT WideVT = WideVal.getValueType();
16475 unsigned ConvertedOp = 0;
16476 // Use a target machine opcode to prevent further DAGCombine
16477 // optimizations that may separate the arithmetic operations
16478 // from the setcc node.
16479 switch (WideVal.getOpcode()) {
16481 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
16482 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
16483 case ISD::AND: ConvertedOp = X86ISD::AND; break;
16484 case ISD::OR: ConvertedOp = X86ISD::OR; break;
16485 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
16489 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16490 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
16491 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
16492 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
16493 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
16499 // Emit KTEST for bit vectors
16500 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16503 // Emit a CMP with 0, which is the TEST pattern.
16504 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16505 DAG.getConstant(0, dl, Op.getValueType()));
16507 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
16508 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
16510 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
16511 DAG.ReplaceAllUsesWith(Op, New);
16512 return SDValue(New.getNode(), 1);
16515 /// Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
16517 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
16518 const SDLoc &dl, SelectionDAG &DAG) const {
16519 if (isNullConstant(Op1))
16520 return EmitTest(Op0, X86CC, dl, DAG);
16522 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16523 "Unexpected comparison operation for MVT::i1 operands");
16525 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16526 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16527 // Only promote the compare up to i32 if it is a 16-bit operation
16528 // with an immediate. 16-bit immediates are to be avoided.
16529 if ((Op0.getValueType() == MVT::i16 &&
16530 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16531 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16532 !Subtarget.isAtom()) {
16533 unsigned ExtendOp =
16534 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16535 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16536 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16538 // Use SUB instead of CMP to enable CSE between SUB and CMP.
16539 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16540 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
16542 return SDValue(Sub.getNode(), 1);
16544 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16547 /// Convert a comparison if required by the subtarget.
16548 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16549 SelectionDAG &DAG) const {
16550 // If the subtarget does not support the FUCOMI instruction, floating-point
16551 // comparisons have to be converted.
16552 if (Subtarget.hasCMov() ||
16553 Cmp.getOpcode() != X86ISD::CMP ||
16554 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16555 !Cmp.getOperand(1).getValueType().isFloatingPoint())
16558 // The instruction selector will select an FUCOM instruction instead of
16559 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16560 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16561 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
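// At the instruction level this ends up as roughly: fucom(p); fnstsw %ax; sahf.
// The shift by 8 models moving the relevant FPSW condition bits into AH, which
// SAHF then copies into EFLAGS.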
16563 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16564 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16565 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16566 DAG.getConstant(8, dl, MVT::i8));
16567 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16569 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16570 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16571 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16574 /// Check if replacement of SQRT with RSQRT should be disabled.
16575 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16576 EVT VT = Op.getValueType();
16578 // We never want to use both SQRT and RSQRT instructions for the same input.
16579 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16583 return Subtarget.hasFastVectorFSQRT();
16584 return Subtarget.hasFastScalarFSQRT();
16587 /// The minimum architected relative accuracy is 2^-12. We need one
16588 /// Newton-Raphson step to have a good float result (24 bits of precision).
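/// The refinement itself is performed by the generic DAG combiner; one step
/// computes roughly Est' = Est * (1.5 - 0.5 * Op * Est * Est).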
16589 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16590 SelectionDAG &DAG, int Enabled,
16591 int &RefinementSteps,
16592 bool &UseOneConstNR,
16593 bool Reciprocal) const {
16594 EVT VT = Op.getValueType();
16596 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16597 // TODO: Add support for AVX512 (v16f32).
16598 // It is likely not profitable to do this for f64 because a double-precision
16599 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16600 // instructions: convert to single, rsqrtss, convert back to double, refine
16601 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16602 // along with FMA, this could be a throughput win.
16603 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16604 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16605 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16606 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16607 RefinementSteps = 1;
16609 UseOneConstNR = false;
16610 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16615 /// The minimum architected relative accuracy is 2^-12. We need one
16616 /// Newton-Raphson step to have a good float result (24 bits of precision).
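/// As with the square root estimate, the generic DAG combiner performs the
/// refinement; one step computes roughly Est' = Est * (2.0 - Op * Est).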
16617 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16619 int &RefinementSteps) const {
16620 EVT VT = Op.getValueType();
16622 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16623 // TODO: Add support for AVX512 (v16f32).
16624 // It is likely not profitable to do this for f64 because a double-precision
16625 // reciprocal estimate with refinement on x86 prior to FMA requires
16626 // 15 instructions: convert to single, rcpss, convert back to double, refine
16627 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16628 // along with FMA, this could be a throughput win.
16630 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16631 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16632 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16633 // Enable estimate codegen with 1 refinement step for vector division.
16634 // Scalar division estimates are disabled because they break too much
16635 // real-world code. These defaults are intended to match GCC behavior.
16636 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
16639 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16640 RefinementSteps = 1;
16642 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16647 /// If we have at least two divisions that use the same divisor, convert to
16648 /// multiplication by a reciprocal. This may need to be adjusted for a given
16649 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16650 /// This is because we still need one division to calculate the reciprocal and
16651 /// then we need two multiplies by that reciprocal as replacements for the
16652 /// original divisions.
16653 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16657 /// Helper for creating a X86ISD::SETCC node.
16658 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16659 SelectionDAG &DAG) {
16660 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16661 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16664 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16665 /// according to equal/not-equal condition code \p CC.
16666 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16667 const SDLoc &dl, SelectionDAG &DAG) {
16668 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
16669 // instruction. Since the shift amount is in-range-or-undefined, we know
16670 // that doing a bittest on the i32 value is ok. We extend to i32 because
16671 // the encoding for the i16 version is larger than the i32 version.
16672 // Also promote i16 to i32 for performance / code size reason.
16673 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16674 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16676 // See if we can use the 32-bit instruction instead of the 64-bit one for a
16677 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16678 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16679 // known to be zero.
16680 if (Src.getValueType() == MVT::i64 &&
16681 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16682 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16684 // If the operand types disagree, extend the shift amount to match. Since
16685 // BT ignores high bits (like shifts) we can use anyextend.
16686 if (Src.getValueType() != BitNo.getValueType())
16687 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
16689 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16690 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16691 return getSETCC(Cond, BT, dl , DAG);
16694 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16695 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16696 const SDLoc &dl, SelectionDAG &DAG) {
16697 SDValue Op0 = And.getOperand(0);
16698 SDValue Op1 = And.getOperand(1);
16699 if (Op0.getOpcode() == ISD::TRUNCATE)
16700 Op0 = Op0.getOperand(0);
16701 if (Op1.getOpcode() == ISD::TRUNCATE)
16702 Op1 = Op1.getOperand(0);
16705 if (Op1.getOpcode() == ISD::SHL)
16706 std::swap(Op0, Op1);
16707 if (Op0.getOpcode() == ISD::SHL) {
16708 if (isOneConstant(Op0.getOperand(0))) {
16709 // If we looked past a truncate, check that it's only truncating away known zeros.
16711 unsigned BitWidth = Op0.getValueSizeInBits();
16712 unsigned AndBitWidth = And.getValueSizeInBits();
16713 if (BitWidth > AndBitWidth) {
16715 DAG.computeKnownBits(Op0, Known);
16716 if (Known.Zero.countLeadingOnes() < BitWidth - AndBitWidth)
16720 RHS = Op0.getOperand(1);
16722 } else if (Op1.getOpcode() == ISD::Constant) {
16723 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16724 uint64_t AndRHSVal = AndRHS->getZExtValue();
16725 SDValue AndLHS = Op0;
16727 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16728 LHS = AndLHS.getOperand(0);
16729 RHS = AndLHS.getOperand(1);
16732 // Use BT if the immediate can't be encoded in a TEST instruction.
16733 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
16735 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
16740 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
16745 // Convert (truncate (srl X, N) to i1) to (bt X, N)
16746 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
16747 const SDLoc &dl, SelectionDAG &DAG) {
16749 assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
16750 "Expected TRUNCATE to i1 node");
16752 if (Op.getOperand(0).getOpcode() != ISD::SRL)
16755 SDValue ShiftRight = Op.getOperand(0);
16756 return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
16760 /// Result of 'and' or 'trunc to i1' is compared against zero.
16761 /// Change to a BT node if possible.
16762 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
16763 const SDLoc &dl, SelectionDAG &DAG) const {
16764 if (Op.getOpcode() == ISD::AND)
16765 return LowerAndToBT(Op, CC, dl, DAG);
16766 if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
16767 return LowerTruncateToBT(Op, CC, dl, DAG);
16771 /// Turns an ISD::CondCode into a value suitable for an SSE floating-point mask CC.
16773 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
16778 // SSE Condition code mapping:
16787 switch (SetCCOpcode) {
16788 default: llvm_unreachable("Unexpected SETCC condition");
16790 case ISD::SETEQ: SSECC = 0; break;
16792 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
16794 case ISD::SETOLT: SSECC = 1; break;
16796 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
16798 case ISD::SETOLE: SSECC = 2; break;
16799 case ISD::SETUO: SSECC = 3; break;
16801 case ISD::SETNE: SSECC = 4; break;
16802 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16803 case ISD::SETUGE: SSECC = 5; break;
16804 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16805 case ISD::SETUGT: SSECC = 6; break;
16806 case ISD::SETO: SSECC = 7; break;
16808 case ISD::SETONE: SSECC = 8; break;
16811 std::swap(Op0, Op1);
16816 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
16817 /// concatenate the results back.
16818 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
16819 MVT VT = Op.getSimpleValueType();
16821 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
16822 "Unsupported value type for operation");
16824 unsigned NumElems = VT.getVectorNumElements();
16826 SDValue CC = Op.getOperand(2);
16828 // Extract the LHS vectors
16829 SDValue LHS = Op.getOperand(0);
16830 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
16831 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
16833 // Extract the RHS vectors
16834 SDValue RHS = Op.getOperand(1);
16835 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
16836 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
16838 // Issue the operation on the smaller types and concatenate the result back
16839 MVT EltVT = VT.getVectorElementType();
16840 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16841 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16842 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
16843 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
16846 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16847 SDValue Op0 = Op.getOperand(0);
16848 SDValue Op1 = Op.getOperand(1);
16849 SDValue CC = Op.getOperand(2);
16850 MVT VT = Op.getSimpleValueType();
16853 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
16854 "Unexpected type for boolean compare operation");
16855 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16856 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
16857 DAG.getConstant(-1, dl, VT));
16858 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
16859 DAG.getConstant(-1, dl, VT));
16860 switch (SetCCOpcode) {
16861 default: llvm_unreachable("Unexpected SETCC condition");
16863 // (x == y) -> ~(x ^ y)
16864 return DAG.getNode(ISD::XOR, dl, VT,
16865 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
16866 DAG.getConstant(-1, dl, VT));
16868 // (x != y) -> (x ^ y)
16869 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
16872 // (x > y) -> (x & ~y)
16873 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
16876 // (x < y) -> (~x & y)
16877 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
16880 // (x <= y) -> (~x | y)
16881 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
16884 // (x >= y) -> (x | ~y)
16885 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
16889 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16891 SDValue Op0 = Op.getOperand(0);
16892 SDValue Op1 = Op.getOperand(1);
16893 SDValue CC = Op.getOperand(2);
16894 MVT VT = Op.getSimpleValueType();
16897 assert(VT.getVectorElementType() == MVT::i1 &&
16898 "Cannot set masked compare for this operation");
16900 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16902 bool Unsigned = false;
16905 switch (SetCCOpcode) {
16906 default: llvm_unreachable("Unexpected SETCC condition");
16907 case ISD::SETNE: SSECC = 4; break;
16908 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
16909 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
16910 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
16911 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
16912 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
16913 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; // NLT
16914 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
16915 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
16916 case ISD::SETLE: SSECC = 2; break;
16920 std::swap(Op0, Op1);
16922 return DAG.getNode(Opc, dl, VT, Op0, Op1);
16923 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
16924 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16925 DAG.getConstant(SSECC, dl, MVT::i8));
16928 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
16929 /// operand \p Op1. If this is not trivially possible (for example because
16930 /// \p Op1 is not a constant), return an empty value.
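/// For example (illustrative): "x <u <5,5,5,5>" can be rewritten as
/// "x <=u <4,4,4,4>". This is only valid when no constant element is zero,
/// since 0 - 1 would wrap around; that is what the underflow check below
/// guards against.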
16931 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
16932 SelectionDAG &DAG) {
16933 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
16937 MVT VT = Op1.getSimpleValueType();
16938 MVT EVT = VT.getVectorElementType();
16939 unsigned n = VT.getVectorNumElements();
16940 SmallVector<SDValue, 8> ULTOp1;
16942 for (unsigned i = 0; i < n; ++i) {
16943 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
16944 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
16947 // Avoid underflow.
16948 APInt Val = Elt->getAPIntValue();
16952 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
16955 return DAG.getBuildVector(VT, dl, ULTOp1);
16958 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
16959 SelectionDAG &DAG) {
16960 SDValue Op0 = Op.getOperand(0);
16961 SDValue Op1 = Op.getOperand(1);
16962 SDValue CC = Op.getOperand(2);
16963 MVT VT = Op.getSimpleValueType();
16964 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16965 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
16970 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
16971 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
16975 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
16976 assert(VT.getVectorNumElements() <= 16);
16977 Opc = X86ISD::CMPM;
16979 Opc = X86ISD::CMPP;
16980 // The SSE/AVX packed FP comparison nodes are defined with a
16981 // floating-point vector result that matches the operand type. This allows
16982 // them to work with an SSE1 target (integer vector types are not legal).
16983 VT = Op0.getSimpleValueType();
16986 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
16987 // emit two comparisons and a logic op to tie them together.
16988 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is available.
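// As a reminder of the predicate algebra involved: SETUEQ means "unordered
// or equal", i.e. UNORD | EQ, which is why its two compares are combined
// with an OR below; SETONE means "ordered and not equal", i.e. ORD & NEQ,
// which is why its two compares are combined with an AND.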
16991 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
16993 // LLVM predicate is SETUEQ or SETONE.
16995 unsigned CombineOpc;
16996 if (SetCCOpcode == ISD::SETUEQ) {
16999 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
17000 static_cast<unsigned>(ISD::OR);
17002 assert(SetCCOpcode == ISD::SETONE);
17005 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
17006 static_cast<unsigned>(ISD::AND);
17009 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17010 DAG.getConstant(CC0, dl, MVT::i8));
17011 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17012 DAG.getConstant(CC1, dl, MVT::i8));
17013 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17015 // Handle all other FP comparisons here.
17016 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17017 DAG.getConstant(SSECC, dl, MVT::i8));
17020 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17021 // result type of SETCC. The bitcast is expected to be optimized away
17022 // during combining/isel.
17023 if (Opc == X86ISD::CMPP)
17024 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17029 MVT VTOp0 = Op0.getSimpleValueType();
17030 assert(VTOp0 == Op1.getSimpleValueType() &&
17031 "Expected operands with same type!");
17032 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17033 "Invalid number of packed elements for source and destination!");
17035 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17036 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17037 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
17038 // legalizer first checks whether the first operand of the setcc has
17039 // a legal type. If so, it promotes the return type to that same type.
17040 // Otherwise, the return type is promoted to the 'next legal type', which,
17041 // for a vector of MVT::i1, is always a 128-bit integer vector type.
17043 // We reach this code only if the following two conditions are met:
17044 // 1. Both return type and operand type have been promoted to wider types
17045 // by the type legalizer.
17046 // 2. The original operand type has been promoted to a 256-bit vector.
17048 // Note that condition 2. only applies for AVX targets.
17049 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
17050 return DAG.getZExtOrTrunc(NewOp, dl, VT);
17053 // The non-AVX512 code below works under the assumption that source and
17054 // destination types are the same.
17055 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17056 "Value types for source and destination must be the same!");
17058 // Break 256-bit integer vector compare into smaller ones.
17059 if (VT.is256BitVector() && !Subtarget.hasInt256())
17060 return Lower256IntVSETCC(Op, DAG);
17062 // Operands are boolean (vectors of i1)
17063 MVT OpVT = Op1.getSimpleValueType();
17064 if (OpVT.getVectorElementType() == MVT::i1)
17065 return LowerBoolVSETCC_AVX512(Op, DAG);
17067 // The result is boolean, but operands are int/float
17068 if (VT.getVectorElementType() == MVT::i1) {
17069 // In the AVX-512 architecture setcc returns a mask with i1 elements,
17070 // but there is no compare instruction for i8 and i16 elements on KNL.
17071 // In this case use the SSE compare and truncate the result.
17072 bool UseAVX512Inst =
17073 (OpVT.is512BitVector() ||
17074 OpVT.getScalarSizeInBits() >= 32 ||
17075 (Subtarget.hasBWI() && Subtarget.hasVLX()));
17078 return LowerIntVSETCC_AVX512(Op, DAG);
17080 return DAG.getNode(ISD::TRUNCATE, dl, VT,
17081 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17084 // Lower using XOP integer comparisons.
17085 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17086 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17087 // Translate compare code to XOP PCOM compare mode.
17088 unsigned CmpMode = 0;
17089 switch (SetCCOpcode) {
17090 default: llvm_unreachable("Unexpected SETCC condition");
17092 case ISD::SETLT: CmpMode = 0x00; break;
17094 case ISD::SETLE: CmpMode = 0x01; break;
17096 case ISD::SETGT: CmpMode = 0x02; break;
17098 case ISD::SETGE: CmpMode = 0x03; break;
17099 case ISD::SETEQ: CmpMode = 0x04; break;
17100 case ISD::SETNE: CmpMode = 0x05; break;
17103 // Are we comparing unsigned or signed integers?
17104 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
17105 ? X86ISD::VPCOMU : X86ISD::VPCOM;
17107 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17108 DAG.getConstant(CmpMode, dl, MVT::i8));
17111 // We are handling one of the integer comparisons here. Since SSE only has
17112 // GT and EQ comparisons for integers, swapping operands and/or multiple
17113 // operations may be required for some comparisons.
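// For example (summarizing the switch below, not adding new cases):
//   x <s y   becomes  pcmpgt(y, x)                  (swap the operands)
//   x <=s y  becomes  not(pcmpgt(x, y))             (invert the result)
//   x >u y   becomes  pcmpgt(x ^ sign, y ^ sign)    (flip the sign bits)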
17115 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
17116 bool Subus = false;
17118 switch (SetCCOpcode) {
17119 default: llvm_unreachable("Unexpected SETCC condition");
17120 case ISD::SETNE: Invert = true;
17121 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
17122 case ISD::SETLT: Swap = true;
17123 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
17124 case ISD::SETGE: Swap = true;
17125 case ISD::SETLE: Opc = X86ISD::PCMPGT;
17126 Invert = true; break;
17127 case ISD::SETULT: Swap = true;
17128 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
17129 FlipSigns = true; break;
17130 case ISD::SETUGE: Swap = true;
17131 case ISD::SETULE: Opc = X86ISD::PCMPGT;
17132 FlipSigns = true; Invert = true; break;
17135 // Special case: Use min/max operations for SETULE/SETUGE
17136 MVT VET = VT.getVectorElementType();
17138 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
17139 || (Subtarget.hasSSE2() && (VET == MVT::i8));
17142 switch (SetCCOpcode) {
17144 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17145 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17148 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
17151 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17152 if (!MinMax && hasSubus) {
17153 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for Op0 u<= Op1 emit:
17155 //   t = psubus Op0, Op1
17156 //   pcmpeq t, <0..0>
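// This works because psubus computes max(Op0 - Op1, 0) with unsigned
// saturation, so the result is all-zero exactly when Op0 u<= Op1; comparing
// it against zero with pcmpeq therefore materializes the u<= mask without a
// sign-bit flip.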
17157 switch (SetCCOpcode) {
17159 case ISD::SETULT: {
17160 // If the comparison is against a constant we can turn this into a
17161 // setule. With psubus, setule does not require a swap. This is
17162 // beneficial because the register holding the constant is no longer
17163 // clobbered as the destination, so the constant can be hoisted out of a loop.
17164 // Only do this pre-AVX since vpcmp* is no longer destructive.
17165 if (Subtarget.hasAVX())
17167 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17169 Subus = true; Invert = false; Swap = false;
17173 // Psubus is better than flip-sign because it requires no inversion.
17174 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
17175 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17179 Opc = X86ISD::SUBUS;
17185 std::swap(Op0, Op1);
17187 // Check that the operation in question is available (most are plain SSE2,
17188 // but PCMPGTQ and PCMPEQQ have different requirements).
17189 if (VT == MVT::v2i64) {
17190 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17191 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17193 // First cast everything to the right type.
17194 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17195 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17197 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17198 // bits of the inputs before performing those operations. The lower
17199 // compare is always unsigned.
17202 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17204 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17205 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17206 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17208 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17209 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
17211 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
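// In other words, the 64-bit ordering is decided by the high 32-bit halves
// unless they are equal, in which case the low halves decide. The low halves
// always compare unsigned, which is why their sign bits were flipped above so
// that the signed PCMPGTD yields the unsigned ordering.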
17212 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
17213 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
17215 // Create masks for only the low parts/high parts of the 64-bit integers.
17216 static const int MaskHi[] = { 1, 1, 3, 3 };
17217 static const int MaskLo[] = { 0, 0, 2, 2 };
17218 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
17219 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
17220 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
17222 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
17223 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
17226 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17228 return DAG.getBitcast(VT, Result);
17231 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
17232 // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
17233 // pcmpeqd + pshufd + pand.
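// Sketch of the idea: pcmpeqd produces a per-dword equality mask, pshufd with
// the { 1, 0, 3, 2 } mask swaps the two dwords inside each qword, and the
// final pand leaves a qword all-ones only if both of its dword halves
// compared equal.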
17234 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17236 // First cast everything to the right type.
17237 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17238 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17241 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
17243 // Make sure the lower and upper halves are both all-ones.
17244 static const int Mask[] = { 1, 0, 3, 2 };
17245 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
17246 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
17249 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17251 return DAG.getBitcast(VT, Result);
17255 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17256 // bits of the inputs before performing those operations.
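// A small worked example of the sign-flip trick: for i8 elements, x <u y is
// equivalent to (x ^ 0x80) <s (y ^ 0x80), because XORing the sign bit maps
// 0..127 onto -128..-1 and 128..255 onto 0..127, preserving the unsigned
// order under a signed comparison.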
17258 MVT EltVT = VT.getVectorElementType();
17259 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
17261 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
17262 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
17265 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
17267 // If the logical-not of the result is required, perform that now.
17269 Result = DAG.getNOT(dl, Result, VT);
17272 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
17275 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
17276 getZeroVector(VT, Subtarget, DAG, dl));
17281 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17283 MVT VT = Op.getSimpleValueType();
17285 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17287 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
17288 && "SetCC type must be 8-bit or 1-bit integer");
17289 SDValue Op0 = Op.getOperand(0);
17290 SDValue Op1 = Op.getOperand(1);
17292 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17294 // Optimize to BT if possible.
17295 // Lower (X & (1 << N)) == 0 to BT(X, N).
17296 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17297 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
17298 // Lower (trunc (X >> N) to i1) to BT(X, N).
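// For example (illustrative): for (x & (1 << n)) != 0, BT copies bit n of x
// into the carry flag and the boolean result is then read back with a
// carry-based setcc, avoiding the explicit shift and mask.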
17299 if (Op0.hasOneUse() && isNullConstant(Op1) &&
17300 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17301 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
17303 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
17308 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of these.
17310 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17311 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17313 // If the input is a setcc, then reuse the input setcc or use a new one with
17314 // the inverted condition.
17315 if (Op0.getOpcode() == X86ISD::SETCC) {
17316 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
17317 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
17321 CCode = X86::GetOppositeBranchCondition(CCode);
17322 SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
17324 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17328 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17329 if (isOneConstant(Op1)) {
17330 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
17331 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
17333 if (!isNullConstant(Op1)) {
17334 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
17335 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
17339 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17340 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
17341 if (X86CC == X86::COND_INVALID)
17344 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17345 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17346 SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
17348 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17352 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
17353 SDValue LHS = Op.getOperand(0);
17354 SDValue RHS = Op.getOperand(1);
17355 SDValue Carry = Op.getOperand(2);
17356 SDValue Cond = Op.getOperand(3);
17359 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
17360 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
17362 assert(Carry.getOpcode() != ISD::CARRY_FALSE);
17363 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17364 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
17365 SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
17366 if (Op.getSimpleValueType() == MVT::i1)
17367 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
17371 /// Return true if the opcode is an X86 logical comparison.
17372 static bool isX86LogicalCmp(SDValue Op) {
17373 unsigned Opc = Op.getOpcode();
17374 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
17375 Opc == X86ISD::SAHF)
17377 if (Op.getResNo() == 1 &&
17378 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
17379 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
17380 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
17381 Opc == X86ISD::XOR || Opc == X86ISD::AND))
17384 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
17390 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
17391 if (V.getOpcode() != ISD::TRUNCATE)
17394 SDValue VOp0 = V.getOperand(0);
17395 unsigned InBits = VOp0.getValueSizeInBits();
17396 unsigned Bits = V.getValueSizeInBits();
17397 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
17400 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
17401 bool AddTest = true;
17402 SDValue Cond = Op.getOperand(0);
17403 SDValue Op1 = Op.getOperand(1);
17404 SDValue Op2 = Op.getOperand(2);
17406 MVT VT = Op1.getSimpleValueType();
17409 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17410 // are available, or into VBLENDV if AVX is available.
17411 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
17412 if (Cond.getOpcode() == ISD::SETCC &&
17413 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
17414 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
17415 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
17416 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
17417 int SSECC = translateX86FSETCC(
17418 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
17421 if (Subtarget.hasAVX512()) {
17422 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
17423 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
17424 return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
17425 DL, VT, Cmp, Op1, Op2);
17428 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
17429 DAG.getConstant(SSECC, DL, MVT::i8));
17431 // If we have AVX, we can use a variable vector select (VBLENDV) instead
17432 // of 3 logic instructions for size savings and potentially speed.
17433 // Unfortunately, there is no scalar form of VBLENDV.
17435 // If either operand is a constant, don't try this. We can expect to
17436 // optimize away at least one of the logic instructions later in that
17437 // case, so that sequence would be faster than a variable blend.
17439 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17440 // uses XMM0 as the selection register. That may need just as many
17441 // instructions as the AND/ANDN/OR sequence due to register moves, so don't bother.
17444 if (Subtarget.hasAVX() &&
17445 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17447 // Convert to vectors, do a VSELECT, and convert back to scalar.
17448 // All of the conversions should be optimized away.
17450 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17451 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17452 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17453 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17455 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
17456 VCmp = DAG.getBitcast(VCmpVT, VCmp);
17458 SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
17460 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17461 VSel, DAG.getIntPtrConstant(0, DL));
17463 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
17464 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
17465 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
17469 // AVX512 fallback is to lower selects of scalar floats to masked moves.
17470 if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
17471 Subtarget.hasAVX512())
17472 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
17474 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
17476 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
17477 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
17478 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
17479 Op1Scalar = Op1.getOperand(0);
17481 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
17482 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
17483 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
17484 Op2Scalar = Op2.getOperand(0);
17485 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
17486 SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
17487 Op1Scalar.getValueType(),
17488 Cond, Op1Scalar, Op2Scalar);
17489 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
17490 return DAG.getBitcast(VT, newSelect);
17491 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
17492 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
17493 DAG.getIntPtrConstant(0, DL));
17497 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
17498 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
17499 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17500 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
17501 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17502 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
17503 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
17505 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
17508 if (Cond.getOpcode() == ISD::SETCC) {
17509 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
17511 // If the condition was updated, it's possible that the operands of the
17512 // select were also updated (for example, EmitTest has a RAUW). Refresh
17513 // the local references to the select operands in case they got stale.
17514 Op1 = Op.getOperand(1);
17515 Op2 = Op.getOperand(2);
17519 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
17520 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
17521 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
17522 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
17523 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
17524 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
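// Worked example for the last two patterns: with the condition being
// (x & 0x1) == 0, the mask -(and (x, 0x1)) is all-zeros when the bit is clear
// and all-ones when it is set, so ((-(and (x, 0x1)) & z) ^ y) yields y in the
// first case and (z ^ y) in the second, exactly the two select arms.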
17525 if (Cond.getOpcode() == X86ISD::SETCC &&
17526 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
17527 isNullConstant(Cond.getOperand(1).getOperand(1))) {
17528 SDValue Cmp = Cond.getOperand(1);
17529 unsigned CondCode =
17530 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17532 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17533 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
17534 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17536 SDValue CmpOp0 = Cmp.getOperand(0);
17537 // Apply further optimizations for special cases
17538 // (select (x != 0), -1, 0) -> neg & sbb
17539 // (select (x == 0), 0, -1) -> neg & sbb
17540 if (isNullConstant(Y) &&
17541 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
17542 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17543 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
17544 DAG.getConstant(0, DL,
17545 CmpOp0.getValueType()),
17547 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17548 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17549 SDValue(Neg.getNode(), 1));
17553 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17554 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17555 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17557 SDValue Res = // Res = 0 or -1.
17558 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17559 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17561 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17562 Res = DAG.getNOT(DL, Res, Res.getValueType());
17564 if (!isNullConstant(Op2))
17565 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17567 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
17568 Cmp.getOperand(0).getOpcode() == ISD::AND &&
17569 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
17570 SDValue CmpOp0 = Cmp.getOperand(0);
17571 SDValue Src1, Src2;
17572 // True if Op2 is an XOR or OR operator and one of its operands
17574 // equals Op1, i.e. the pair matches ( a , a op b ) or ( b , a op b ).
17575 auto isOrXorPattern = [&]() {
17576 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
17577 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
17579 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
17586 if (isOrXorPattern()) {
17588 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
17589 // We need a mask of all zeros or all ones with the same size as the other operands.
17591 if (CmpSz > VT.getSizeInBits())
17592 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
17593 else if (CmpSz < VT.getSizeInBits())
17594 Neg = DAG.getNode(ISD::AND, DL, VT,
17595 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
17596 DAG.getConstant(1, DL, VT));
17599 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
17600 Neg); // -(and (x, 0x1))
17601 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
17602 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
17607 // Look past (and (setcc_carry (cmp ...)), 1).
17608 if (Cond.getOpcode() == ISD::AND &&
17609 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17610 isOneConstant(Cond.getOperand(1)))
17611 Cond = Cond.getOperand(0);
17613 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17614 // setting operand in place of the X86ISD::SETCC.
17615 unsigned CondOpcode = Cond.getOpcode();
17616 if (CondOpcode == X86ISD::SETCC ||
17617 CondOpcode == X86ISD::SETCC_CARRY) {
17618 CC = Cond.getOperand(0);
17620 SDValue Cmp = Cond.getOperand(1);
17621 unsigned Opc = Cmp.getOpcode();
17622 MVT VT = Op.getSimpleValueType();
17624 bool IllegalFPCMov = false;
17625 if (VT.isFloatingPoint() && !VT.isVector() &&
17626 !isScalarFPTypeInSSEReg(VT)) // FPStack?
17627 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17629 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17630 Opc == X86ISD::BT) { // FIXME
17634 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17635 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17636 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17637 Cond.getOperand(0).getValueType() != MVT::i8)) {
17638 SDValue LHS = Cond.getOperand(0);
17639 SDValue RHS = Cond.getOperand(1);
17640 unsigned X86Opcode;
17643 switch (CondOpcode) {
17644 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17645 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17646 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17647 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17648 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17649 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17650 default: llvm_unreachable("unexpected overflowing operator");
17652 if (CondOpcode == ISD::UMULO)
17653 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17656 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17658 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17660 if (CondOpcode == ISD::UMULO)
17661 Cond = X86Op.getValue(2);
17663 Cond = X86Op.getValue(1);
17665 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17670 // Look past the truncate if the high bits are known zero.
17671 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17672 Cond = Cond.getOperand(0);
17674 // We know the result of the AND is compared against zero. Try to match it to BT.
17676 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17677 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17678 CC = NewSetCC.getOperand(0);
17679 Cond = NewSetCC.getOperand(1);
17686 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17687 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17690 // a < b ? -1 : 0 -> RES = ~setcc_carry
17691 // a < b ? 0 : -1 -> RES = setcc_carry
17692 // a >= b ? -1 : 0 -> RES = setcc_carry
17693 // a >= b ? 0 : -1 -> RES = ~setcc_carry
17694 if (Cond.getOpcode() == X86ISD::SUB) {
17695 Cond = ConvertCmpIfNecessary(Cond, DAG);
17696 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17698 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17699 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17700 (isNullConstant(Op1) || isNullConstant(Op2))) {
17701 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17702 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17704 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17705 return DAG.getNOT(DL, Res, Res.getValueType());
17710 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
17711 // widen the cmov and push the truncate through. This avoids introducing a new
17712 // branch during isel and doesn't add any extensions.
17713 if (Op.getValueType() == MVT::i8 &&
17714 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17715 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17716 if (T1.getValueType() == T2.getValueType() &&
17717 // Blacklist CopyFromReg to avoid partial register stalls.
17718 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17719 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17720 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17721 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17725 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17726 // condition is true.
17727 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17728 SDValue Ops[] = { Op2, Op1, CC, Cond };
17729 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17732 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17733 const X86Subtarget &Subtarget,
17734 SelectionDAG &DAG) {
17735 MVT VT = Op->getSimpleValueType(0);
17736 SDValue In = Op->getOperand(0);
17737 MVT InVT = In.getSimpleValueType();
17738 MVT VTElt = VT.getVectorElementType();
17739 MVT InVTElt = InVT.getVectorElementType();
17743 if ((InVTElt == MVT::i1) &&
17744 (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
17746 ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
17748 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17750 unsigned NumElts = VT.getVectorNumElements();
17752 if (VT.is512BitVector() && InVTElt != MVT::i1 &&
17753 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
17754 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
17755 return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
17756 return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
17759 if (InVTElt != MVT::i1)
17763 if (!VT.is512BitVector() && !Subtarget.hasVLX())
17764 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
17767 if (Subtarget.hasDQI()) {
17768 V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
17769 assert(!VT.is512BitVector() && "Unexpected vector type");
17771 SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
17772 SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
17773 V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
17778 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
17781 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
17782 // For sign extend this needs to handle all vector sizes and SSE4.1 and
17783 // non-SSE4.1 targets. For zero extend this should only handle inputs of
17784 // MVT::v64i8 when BWI is not supported, but AVX512 is.
17785 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
17786 const X86Subtarget &Subtarget,
17787 SelectionDAG &DAG) {
17788 SDValue In = Op->getOperand(0);
17789 MVT VT = Op->getSimpleValueType(0);
17790 MVT InVT = In.getSimpleValueType();
17791 assert(VT.getSizeInBits() == InVT.getSizeInBits());
17793 MVT SVT = VT.getVectorElementType();
17794 MVT InSVT = InVT.getVectorElementType();
17795 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
17797 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
17799 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
17801 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
17802 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
17803 !(VT.is512BitVector() && Subtarget.hasAVX512()))
17808 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17809 // For 512-bit vectors, we need 128-bits or 256-bits.
17810 if (VT.getSizeInBits() > 128) {
17811 // Input needs to be at least the same number of elements as output, and
17812 // at least 128-bits.
17813 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
17814 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
17817 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
17818 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
17820 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
17821 // so these are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
17822 // need to be handled here for 256/512-bit results.
17823 if (Subtarget.hasInt256()) {
17824 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
17825 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
17826 X86ISD::VSEXT : X86ISD::VZEXT;
17827 return DAG.getNode(ExtOpc, dl, VT, In);
17830 // We should only get here for sign extend.
17831 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
17832 "Unexpected opcode!");
17834 // Pre-SSE41 targets unpack the lower lanes and then sign-extend using SRAI.
17838 // As SRAI is only available on i16/i32 types, we expand only up to i32
17839 // and handle i64 separately.
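// For example, when extending i8 lanes to i16: the unpack places each source
// byte in the high byte of its 16-bit lane (punpcklbw with undef), and the
// later arithmetic shift right by 8 makes the byte's sign bit fill the upper
// half of the lane.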
17840 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
17841 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
17842 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
17843 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
17844 Curr = DAG.getBitcast(CurrVT, Curr);
17847 SDValue SignExt = Curr;
17848 if (CurrVT != InVT) {
17849 unsigned SignExtShift =
17850 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
17851 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17852 DAG.getConstant(SignExtShift, dl, MVT::i8));
17858 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
17859 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17860 DAG.getConstant(31, dl, MVT::i8));
17861 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
17862 return DAG.getBitcast(VT, Ext);
17868 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17869 SelectionDAG &DAG) {
17870 MVT VT = Op->getSimpleValueType(0);
17871 SDValue In = Op->getOperand(0);
17872 MVT InVT = In.getSimpleValueType();
17875 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
17876 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
17878 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
17879 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
17880 (VT != MVT::v16i16 || InVT != MVT::v16i8))
17883 if (Subtarget.hasInt256())
17884 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17886 // Optimize vectors in AVX mode:
17887 // Sign extend v8i16 to v8i32 and v4i32 to v4i64.
17890 // Divide the input vector into two parts;
17891 // for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 },
17892 // use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
17893 // then concat the vectors back to the original VT.
17895 unsigned NumElems = InVT.getVectorNumElements();
17896 SDValue Undef = DAG.getUNDEF(InVT);
17898 SmallVector<int,8> ShufMask1(NumElems, -1);
17899 for (unsigned i = 0; i != NumElems/2; ++i)
17902 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
17904 SmallVector<int,8> ShufMask2(NumElems, -1);
17905 for (unsigned i = 0; i != NumElems/2; ++i)
17906 ShufMask2[i] = i + NumElems/2;
17908 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
17910 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
17911 VT.getVectorNumElements() / 2);
17913 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
17914 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
17916 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
17919 // Lower a truncating store. We need special lowering for vXi1 vectors.
17920 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
17921 SelectionDAG &DAG) {
17922 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
17924 EVT MemVT = St->getMemoryVT();
17925 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
17926 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
17927 "Expected truncstore of i1 vector");
17929 SDValue Op = St->getValue();
17930 MVT OpVT = Op.getValueType().getSimpleVT();
17931 unsigned NumElts = OpVT.getVectorNumElements();
17932 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17934 // Truncate and store - everything is legal
17935 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
17936 if (MemVT.getSizeInBits() < 8)
17937 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
17938 DAG.getUNDEF(MVT::v8i1), Op,
17939 DAG.getIntPtrConstant(0, dl));
17940 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17941 St->getMemOperand());
17944 // Only a subset of AVX-512 is available; assume we have only AVX-512F.
17945 if (NumElts <= 8) {
17947 // Extend to an 8-element vector.
17948 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
17949 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
17950 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
17952 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
17953 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17954 St->getMemOperand());
17957 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
17958 // Divide the vector into 2 parts and store each part separately
17959 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17960 DAG.getIntPtrConstant(0, dl));
17961 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
17962 SDValue BasePtr = St->getBasePtr();
17963 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
17964 St->getMemOperand());
17965 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17966 DAG.getIntPtrConstant(16, dl));
17967 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
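// Note on the offset: each v16i1 half occupies 16 mask bits, i.e. 2 bytes in
// memory, so the high half is stored at byte offset 2 from the base pointer.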
17969 SDValue BasePtrHi =
17970 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
17971 DAG.getConstant(2, dl, BasePtr.getValueType()));
17973 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
17974 BasePtrHi, St->getMemOperand());
17975 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
17978 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
17979 const X86Subtarget &Subtarget,
17980 SelectionDAG &DAG) {
17982 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
17984 EVT MemVT = Ld->getMemoryVT();
17985 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
17986 "Expected i1 vector load");
17987 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
17988 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17989 MVT VT = Op.getValueType().getSimpleVT();
17990 unsigned NumElts = VT.getVectorNumElements();
17992 if ((Subtarget.hasBWI() && NumElts >= 32) ||
17993 (Subtarget.hasDQI() && NumElts < 16) ||
17995 // Load and extend - everything is legal
17997 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
17999 Ld->getMemOperand());
18000 // Replace chain users with the new chain.
18001 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18002 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18003 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18004 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18006 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18007 DAG.getIntPtrConstant(0, dl));
18009 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18011 Ld->getMemOperand());
18012 // Replace chain users with the new chain.
18013 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18014 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18016 // Finally, do a normal sign-extend to the desired register.
18017 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18020 if (NumElts <= 8) {
18021 // Only a subset of AVX-512 is available; assume we have only AVX-512F.
18022 unsigned NumBitsToLoad = 8;
18023 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18024 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18026 Ld->getMemOperand());
18027 // Replace chain users with the new chain.
18028 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18029 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18031 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18032 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18035 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18037 // For v4i1 and v2i1, extend to an 8-element vector and extract the low part.
18039 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18040 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18041 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18042 DAG.getIntPtrConstant(0, dl));
18045 assert(VT == MVT::v32i8 && "Unexpected extload type");
18047 SmallVector<SDValue, 2> Chains;
18049 SDValue BasePtr = Ld->getBasePtr();
18050 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18052 Ld->getMemOperand());
18053 Chains.push_back(LoadLo.getValue(1));
18055 SDValue BasePtrHi =
18056 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18057 DAG.getConstant(2, dl, BasePtr.getValueType()));
18059 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18061 Ld->getMemOperand());
18062 Chains.push_back(LoadHi.getValue(1));
18063 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18064 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18066 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18067 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18068 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18071 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18072 // may emit an illegal shuffle but the expansion is still better than scalar
18073 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18074 // we'll emit a shuffle and an arithmetic shift.
18075 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18076 // TODO: It is possible to support ZExt by zeroing the undef values during
18077 // the shuffle phase or after the shuffle.
18078 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18079 SelectionDAG &DAG) {
18080 MVT RegVT = Op.getSimpleValueType();
18081 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18082 assert(RegVT.isInteger() &&
18083 "We only custom lower integer vector sext loads.");
18085 // Nothing useful we can do without SSE2 shuffles.
18086 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18088 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18090 EVT MemVT = Ld->getMemoryVT();
18091 if (MemVT.getScalarType() == MVT::i1)
18092 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18094 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18095 unsigned RegSz = RegVT.getSizeInBits();
18097 ISD::LoadExtType Ext = Ld->getExtensionType();
18099 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18100 && "Only anyext and sext are currently implemented.");
18101 assert(MemVT != RegVT && "Cannot extend to the same type");
18102 assert(MemVT.isVector() && "Must load a vector from memory");
18104 unsigned NumElems = RegVT.getVectorNumElements();
18105 unsigned MemSz = MemVT.getSizeInBits();
18106 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18108 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18109 // The only way in which we have a legal 256-bit vector result but not the
18110 // integer 256-bit operations needed to directly lower a sextload is if we
18111 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18112 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18113 // correctly legalized. We do this late to allow the canonical form of
18114 // sextload to persist throughout the rest of the DAG combiner -- it wants
18115 // to fold together any extensions it can, and so will fuse a sign_extend
18116 // of an sextload into a sextload targeting a wider value.
18118 if (MemSz == 128) {
18119 // Just switch this to a normal load.
18120 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18121 "it must be a legal 128-bit vector "
18123 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18124 Ld->getPointerInfo(), Ld->getAlignment(),
18125 Ld->getMemOperand()->getFlags());
18127 assert(MemSz < 128 &&
18128 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18129 // Do an sext load to a 128-bit vector type. We want to use the same
18130 // number of elements, but elements half as wide. This will end up being
18131 // recursively lowered by this routine, but will succeed as we definitely
18132 // have all the necessary features if we're using AVX1.
18134 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18135 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18137 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18138 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18139 Ld->getMemOperand()->getFlags());
18142 // Replace chain users with the new chain.
18143 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18144 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18146 // Finally, do a normal sign-extend to the desired register.
18147 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18150 // All sizes must be a power of two.
18151 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18152 "Non-power-of-two elements are not custom lowered!");
18154 // Attempt to load the original value using scalar loads.
18155 // Find the largest scalar type that divides the total loaded size.
18156 MVT SclrLoadTy = MVT::i8;
18157 for (MVT Tp : MVT::integer_valuetypes()) {
18158 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18163 // On 32-bit systems, we can't use 64-bit integers. Try bitcasting to f64.
18164 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18166 SclrLoadTy = MVT::f64;
18168 // Calculate the number of scalar loads that we need to perform
18169 // in order to load our vector from memory.
18170 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18172 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18173 "Can only lower sext loads with a single scalar load!");
18175 unsigned loadRegSize = RegSz;
18176 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18177 loadRegSize = 128;
18179 // Represent our vector as a sequence of elements which are the
18180 // largest scalar that we can load.
18181 EVT LoadUnitVecVT = EVT::getVectorVT(
18182 *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
18184 // Represent the data using the same element type that is stored in
18185 // memory. In practice, we 'widen' MemVT.
18186 EVT WideVecVT =
18187 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18188 loadRegSize / MemVT.getScalarSizeInBits());
18190 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18191 "Invalid vector type");
18193 // We can't shuffle using an illegal type.
18194 assert(TLI.isTypeLegal(WideVecVT) &&
18195 "We only lower types that form legal widened vector types");
18197 SmallVector<SDValue, 8> Chains;
18198 SDValue Ptr = Ld->getBasePtr();
18199 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18200 TLI.getPointerTy(DAG.getDataLayout()));
18201 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18203 for (unsigned i = 0; i < NumLoads; ++i) {
18204 // Perform a single load.
18205 SDValue ScalarLoad =
18206 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18207 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18208 Chains.push_back(ScalarLoad.getValue(1));
18209 // Create the first element using SCALAR_TO_VECTOR in order to avoid
18210 // another round of DAGCombining.
18212 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18214 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18215 ScalarLoad, DAG.getIntPtrConstant(i, dl));
18217 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18220 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18222 // Bitcast the loaded value to a vector of the original element type, in
18223 // the size of the target vector type.
18224 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18225 unsigned SizeRatio = RegSz / MemSz;
18227 if (Ext == ISD::SEXTLOAD) {
18228 // If we have SSE4.1, we can directly emit a VSEXT node.
18229 if (Subtarget.hasSSE41()) {
18230 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18231 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18235 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest lanes.
18237 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18238 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18240 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18241 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18245 // Redistribute the loaded elements into the different locations.
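// For example (illustrative): for an any-extending load of v4i8 into v4i32,
// SizeRatio is 4 and the four loaded bytes sit in lanes 0..3 of the wide
// v16i8 vector; the shuffle moves element i to lane 4 * i so that each byte
// lands in the low byte of its i32 lane, with the remaining lanes undefined.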
18246 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18247 for (unsigned i = 0; i != NumElems; ++i)
18248 ShuffleVec[i * SizeRatio] = i;
18250 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18251 DAG.getUNDEF(WideVecVT), ShuffleVec);
18253 // Bitcast to the requested type.
18254 Shuff = DAG.getBitcast(RegVT, Shuff);
18255 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18259 /// Return true if the node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes,
18260 /// each of which has no other use apart from the AND / OR.
18261 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18262 Opc = Op.getOpcode();
18263 if (Opc != ISD::OR && Opc != ISD::AND)
18265 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18266 Op.getOperand(0).hasOneUse() &&
18267 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18268 Op.getOperand(1).hasOneUse());
18271 /// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and that the
18272 /// SETCC node has a single use.
18273 static bool isXor1OfSetCC(SDValue Op) {
18274 if (Op.getOpcode() != ISD::XOR)
18276 if (isOneConstant(Op.getOperand(1)))
18277 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18278 Op.getOperand(0).hasOneUse();
18282 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18283 bool addTest = true;
18284 SDValue Chain = Op.getOperand(0);
18285 SDValue Cond = Op.getOperand(1);
18286 SDValue Dest = Op.getOperand(2);
18289 bool Inverted = false;
18291 if (Cond.getOpcode() == ISD::SETCC) {
18292 // Check for setcc([su]{add,sub,mul}o == 0).
18293 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18294 isNullConstant(Cond.getOperand(1)) &&
18295 Cond.getOperand(0).getResNo() == 1 &&
18296 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18297 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18298 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18299 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18300 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18301 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18303 Cond = Cond.getOperand(0);
18305 if (SDValue NewCond = LowerSETCC(Cond, DAG))
18310 // FIXME: LowerXALUO doesn't handle these!!
18311 else if (Cond.getOpcode() == X86ISD::ADD ||
18312 Cond.getOpcode() == X86ISD::SUB ||
18313 Cond.getOpcode() == X86ISD::SMUL ||
18314 Cond.getOpcode() == X86ISD::UMUL)
18315 Cond = LowerXALUO(Cond, DAG);
18318 // Look past (and (setcc_carry (cmp ...)), 1).
18319 if (Cond.getOpcode() == ISD::AND &&
18320 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18321 isOneConstant(Cond.getOperand(1)))
18322 Cond = Cond.getOperand(0);
18324 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18325 // setting operand in place of the X86ISD::SETCC.
18326 unsigned CondOpcode = Cond.getOpcode();
18327 if (CondOpcode == X86ISD::SETCC ||
18328 CondOpcode == X86ISD::SETCC_CARRY) {
18329 CC = Cond.getOperand(0);
18331 SDValue Cmp = Cond.getOperand(1);
18332 unsigned Opc = Cmp.getOpcode();
18333 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18334 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18338 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18342 // These can only come from an arithmetic instruction with overflow,
18343 // e.g. SADDO, UADDO.
18344 Cond = Cond.getOperand(1);
18350 CondOpcode = Cond.getOpcode();
18351 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18352 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18353 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18354 Cond.getOperand(0).getValueType() != MVT::i8)) {
18355 SDValue LHS = Cond.getOperand(0);
18356 SDValue RHS = Cond.getOperand(1);
18357 unsigned X86Opcode;
18360 // Keep this in sync with LowerXALUO, otherwise we might create redundant
18361 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18363 switch (CondOpcode) {
18364 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18366 if (isOneConstant(RHS)) {
18367 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18370 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18371 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18373 if (isOneConstant(RHS)) {
18374 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18377 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18378 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18379 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18380 default: llvm_unreachable("unexpected overflowing operator");
18383 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18384 if (CondOpcode == ISD::UMULO)
18385 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18388 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18390 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18392 if (CondOpcode == ISD::UMULO)
18393 Cond = X86Op.getValue(2);
18395 Cond = X86Op.getValue(1);
18397 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18401 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
18402 SDValue Cmp = Cond.getOperand(0).getOperand(1);
18403 if (CondOpc == ISD::OR) {
18404 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
18405 // two branches instead of an explicit OR instruction with a separate test.
18407 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18408 isX86LogicalCmp(Cmp)) {
18409 CC = Cond.getOperand(0).getOperand(0);
18410 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18411 Chain, Dest, CC, Cmp);
18412 CC = Cond.getOperand(1).getOperand(0);
18416 } else { // ISD::AND
18417 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
18418 // two branches instead of an explicit AND instruction with a
18419 // separate test. However, we only do this if this block doesn't
18420 // have a fall-through edge, because this requires an explicit
18421 // jmp when the condition is false.
18422 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18423 isX86LogicalCmp(Cmp) &&
18424 Op.getNode()->hasOneUse()) {
18425 X86::CondCode CCode =
18426 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18427 CCode = X86::GetOppositeBranchCondition(CCode);
18428 CC = DAG.getConstant(CCode, dl, MVT::i8);
18429 SDNode *User = *Op.getNode()->use_begin();
18430 // Look for an unconditional branch following this conditional branch.
18431 // We need this because we need to reverse the successors in order
18432 // to implement FCMP_OEQ.
18433 if (User->getOpcode() == ISD::BR) {
18434 SDValue FalseBB = User->getOperand(1);
18436 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18437 assert(NewBR == User);
18441 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18442 Chain, Dest, CC, Cmp);
18443 X86::CondCode CCode =
18444 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
18445 CCode = X86::GetOppositeBranchCondition(CCode);
18446 CC = DAG.getConstant(CCode, dl, MVT::i8);
18452 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
18453 // Recognize the pattern (xorb (setcc), 1); the xor inverts the condition.
18454 // It should be transformed by the DAG combiner, except when the condition
18455 // is set by an arithmetic-with-overflow node.
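// Illustrative: (brcond (xor (setcc X86::COND_E, EFLAGS), 1), dest) becomes a
// branch on X86::COND_NE over the same flag-producing node.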
18456 X86::CondCode CCode =
18457 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18458 CCode = X86::GetOppositeBranchCondition(CCode);
18459 CC = DAG.getConstant(CCode, dl, MVT::i8);
18460 Cond = Cond.getOperand(0).getOperand(1);
18462 } else if (Cond.getOpcode() == ISD::SETCC &&
18463 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
18464 // For FCMP_OEQ, we can emit
18465 // two branches instead of an explicit AND instruction with a
18466 // separate test. However, we only do this if this block doesn't
18467 // have a fall-through edge, because this requires an explicit
18468 // jmp when the condition is false.
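// Conceptually (illustrative, assuming a ucomiss/ucomisd-style compare): OEQ
// is true only when ZF = 1 and PF = 0, so after swapping the successors we can
// emit "jne <other>" followed by "jp <other>" and fall through to the equal
// block, instead of materializing an AND of two setcc results.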
18469 if (Op.getNode()->hasOneUse()) {
18470 SDNode *User = *Op.getNode()->use_begin();
18471 // Look for an unconditional branch following this conditional branch.
18472 // We need this because we need to reverse the successors in order
18473 // to implement FCMP_OEQ.
18474 if (User->getOpcode() == ISD::BR) {
18475 SDValue FalseBB = User->getOperand(1);
18477 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18478 assert(NewBR == User);
18482 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18483 Cond.getOperand(0), Cond.getOperand(1));
18484 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18485 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18486 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18487 Chain, Dest, CC, Cmp);
18488 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
18493 } else if (Cond.getOpcode() == ISD::SETCC &&
18494 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
18495 // For FCMP_UNE, we can emit
18496 // two branches instead of an explicit AND instruction with a
18497 // separate test. However, we only do this if this block doesn't
18498 // have a fall-through edge, because this requires an explicit
18499 // jmp when the condition is false.
18500 if (Op.getNode()->hasOneUse()) {
18501 SDNode *User = *Op.getNode()->use_begin();
18502 // Look for an unconditional branch following this conditional branch.
18503 // We need this because we need to reverse the successors in order
18504 // to implement FCMP_UNE.
18505 if (User->getOpcode() == ISD::BR) {
18506 SDValue FalseBB = User->getOperand(1);
18508 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18509 assert(NewBR == User);
18512 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18513 Cond.getOperand(0), Cond.getOperand(1));
18514 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18515 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18516 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18517 Chain, Dest, CC, Cmp);
18518 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
18528 // Look past the truncate if the high bits are known zero.
18529 if (isTruncWithZeroHighBitsInput(Cond, DAG))
18530 Cond = Cond.getOperand(0);
18532 // We know the result is compared against zero. Try to match it to BT.
18533 if (Cond.hasOneUse()) {
18534 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
18535 CC = NewSetCC.getOperand(0);
18536 Cond = NewSetCC.getOperand(1);
18543 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
18544 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18545 Cond = EmitTest(Cond, X86Cond, dl, DAG);
18547 Cond = ConvertCmpIfNecessary(Cond, DAG);
18548 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18549 Chain, Dest, CC, Cond);
18552 // Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
18553 // Calls to _alloca are needed to probe the stack when allocating more than 4K
18554 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
18555 // that the guard pages used by the OS virtual memory manager are allocated in
18556 // the correct sequence.
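// Illustrative sketch (exact behavior depends on the target environment): the
// requested size is handed to a stack-probe helper (_alloca, _chkstk or
// ___chkstk_ms, depending on the environment), which touches the new stack
// area one 4K page at a time so the OS can grow the guard region; some
// variants also perform the final stack-pointer adjustment themselves.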
18557 SDValue
18558 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
18559 SelectionDAG &DAG) const {
18560 MachineFunction &MF = DAG.getMachineFunction();
18561 bool SplitStack = MF.shouldSplitStack();
18562 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
18567 SDNode *Node = Op.getNode();
18568 SDValue Chain = Op.getOperand(0);
18569 SDValue Size = Op.getOperand(1);
18570 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18571 EVT VT = Node->getValueType(0);
18573 // Chain the dynamic stack allocation so that it doesn't modify the stack
18574 // pointer when other instructions are using the stack.
18575 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
18577 bool Is64Bit = Subtarget.is64Bit();
18578 MVT SPTy = getPointerTy(DAG.getDataLayout());
18582 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18583 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18584 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18585 " not tell us which reg is the stack pointer!");
18587 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18588 Chain = SP.getValue(1);
18589 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18590 unsigned StackAlign = TFI.getStackAlignment();
18591 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
18592 if (Align > StackAlign)
18593 Result = DAG.getNode(ISD::AND, dl, VT, Result,
18594 DAG.getConstant(-(uint64_t)Align, dl, VT));
18595 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18596 } else if (SplitStack) {
18597 MachineRegisterInfo &MRI = MF.getRegInfo();
18600 // The 64-bit implementation of segmented stacks needs to clobber both r10 and
18601 // r11. This makes it impossible to use it along with nested parameters.
18602 const Function *F = MF.getFunction();
18603 for (const auto &A : F->args()) {
18604 if (A.hasNestAttr())
18605 report_fatal_error("Cannot use segmented stacks with functions that "
18606 "have nested arguments.");
18610 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18611 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18612 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18613 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
18614 DAG.getRegister(Vreg, SPTy));
18616 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18617 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18618 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18620 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18621 unsigned SPReg = RegInfo->getStackRegister();
18622 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
18623 Chain = SP.getValue(1);
18626 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18627 DAG.getConstant(-(uint64_t)Align, dl, VT));
18628 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
18634 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18635 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18637 SDValue Ops[2] = {Result, Chain};
18638 return DAG.getMergeValues(Ops, dl);
18641 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18642 MachineFunction &MF = DAG.getMachineFunction();
18643 auto PtrVT = getPointerTy(MF.getDataLayout());
18644 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18646 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18649 if (!Subtarget.is64Bit() ||
18650 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18651 // vastart just stores the address of the VarArgsFrameIndex slot into the
18652 // memory location argument.
18653 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18654 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18655 MachinePointerInfo(SV));
18658 // __va_list_tag:
18659 //   gp_offset         (0 - 6 * 8)
18660 //   fp_offset         (48 - 48 + 8 * 16)
18661 //   overflow_arg_area (point to parameters coming in memory).
18662 //   reg_save_area
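// For reference, the System V AMD64 va_list this code fills out corresponds to
// (illustrative C, LP64 offsets shown; the x32 ABI uses 4-byte pointers):
//
//   typedef struct {
//     unsigned int gp_offset;      // offset 0
//     unsigned int fp_offset;      // offset 4
//     void *overflow_arg_area;     // offset 8
//     void *reg_save_area;         // offset 16
//   } __va_list_tag;
//
// The four stores below initialize these fields in order.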
18663 SmallVector<SDValue, 8> MemOps;
18664 SDValue FIN = Op.getOperand(1);
18666 SDValue Store = DAG.getStore(
18667 Op.getOperand(0), DL,
18668 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18669 MachinePointerInfo(SV));
18670 MemOps.push_back(Store);
18673 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18674 Store = DAG.getStore(
18675 Op.getOperand(0), DL,
18676 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18677 MachinePointerInfo(SV, 4));
18678 MemOps.push_back(Store);
18680 // Store ptr to overflow_arg_area
18681 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18682 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18684 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18685 MemOps.push_back(Store);
18687 // Store ptr to reg_save_area.
18688 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18689 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18690 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18691 Store = DAG.getStore(
18692 Op.getOperand(0), DL, RSFIN, FIN,
18693 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18694 MemOps.push_back(Store);
18695 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
18698 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18699 assert(Subtarget.is64Bit() &&
18700 "LowerVAARG only handles 64-bit va_arg!");
18701 assert(Op.getNumOperands() == 4);
18703 MachineFunction &MF = DAG.getMachineFunction();
18704 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18705 // The Win64 ABI uses char* instead of a structure.
18706 return DAG.expandVAArg(Op.getNode());
18708 SDValue Chain = Op.getOperand(0);
18709 SDValue SrcPtr = Op.getOperand(1);
18710 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18711 unsigned Align = Op.getConstantOperandVal(3);
18714 EVT ArgVT = Op.getNode()->getValueType(0);
18715 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18716 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
18719 // Decide which area this value should be read from.
18720 // TODO: Implement the AMD64 ABI in its entirety. This simple
18721 // selection mechanism works only for the basic types.
18722 if (ArgVT == MVT::f80) {
18723 llvm_unreachable("va_arg for f80 not yet implemented");
18724 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18725 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
18726 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18727 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
18729 llvm_unreachable("Unhandled argument type in LowerVAARG");
18732 if (ArgMode == 2) {
18733 // Sanity Check: Make sure using fp_offset makes sense.
18734 assert(!Subtarget.useSoftFloat() &&
18735 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
18736 Subtarget.hasSSE1());
18739 // Insert VAARG_64 node into the DAG
18740 // VAARG_64 returns two values: Variable Argument Address, Chain
18741 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
18742 DAG.getConstant(ArgMode, dl, MVT::i8),
18743 DAG.getConstant(Align, dl, MVT::i32)};
18744 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
18745 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
18746 VTs, InstOps, MVT::i64,
18747 MachinePointerInfo(SV),
18749 /*Volatile=*/false,
18751 /*WriteMem=*/true);
18752 Chain = VAARG.getValue(1);
18754 // Load the next argument and return it
18755 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
18758 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
18759 SelectionDAG &DAG) {
18760 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
18761 // where a va_list is still an i8*.
18762 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
18763 if (Subtarget.isCallingConvWin64(
18764 DAG.getMachineFunction().getFunction()->getCallingConv()))
18765 // Probably a Win64 va_copy.
18766 return DAG.expandVACopy(Op.getNode());
18768 SDValue Chain = Op.getOperand(0);
18769 SDValue DstPtr = Op.getOperand(1);
18770 SDValue SrcPtr = Op.getOperand(2);
18771 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
18772 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
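// The value being copied is the 24-byte __va_list_tag described above
// (two i32 offsets plus two pointers), hence the constant 24 in the memcpy.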
18775 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
18776 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
18778 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
18781 /// Handle vector element shifts where the shift amount is a constant.
18782 /// Takes immediate version of shift as input.
18783 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
18784 SDValue SrcOp, uint64_t ShiftAmt,
18785 SelectionDAG &DAG) {
18786 MVT ElementType = VT.getVectorElementType();
18788 // Bitcast the source vector to the output type; this is mainly necessary for
18789 // vXi8/vXi64 shifts.
18790 if (VT != SrcOp.getSimpleValueType())
18791 SrcOp = DAG.getBitcast(VT, SrcOp);
18793 // Fold this packed shift into its first operand if ShiftAmt is 0.
18797 // Check for ShiftAmt >= element width
18798 if (ShiftAmt >= ElementType.getSizeInBits()) {
18799 if (Opc == X86ISD::VSRAI)
18800 ShiftAmt = ElementType.getSizeInBits() - 1;
18802 return DAG.getConstant(0, dl, VT);
18805 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
18806 && "Unknown target vector shift-by-constant node");
18808 // Fold this packed vector shift into a build vector if SrcOp is a
18809 // vector of Constants or UNDEFs.
18810 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
18811 SmallVector<SDValue, 8> Elts;
18812 unsigned NumElts = SrcOp->getNumOperands();
18813 ConstantSDNode *ND;
18815 switch (Opc) {
18816 default: llvm_unreachable("Unknown opcode!");
18817 case X86ISD::VSHLI:
18818 for (unsigned i=0; i!=NumElts; ++i) {
18819 SDValue CurrentOp = SrcOp->getOperand(i);
18820 if (CurrentOp->isUndef()) {
18821 Elts.push_back(CurrentOp);
18824 ND = cast<ConstantSDNode>(CurrentOp);
18825 const APInt &C = ND->getAPIntValue();
18826 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
18829 case X86ISD::VSRLI:
18830 for (unsigned i=0; i!=NumElts; ++i) {
18831 SDValue CurrentOp = SrcOp->getOperand(i);
18832 if (CurrentOp->isUndef()) {
18833 Elts.push_back(CurrentOp);
18836 ND = cast<ConstantSDNode>(CurrentOp);
18837 const APInt &C = ND->getAPIntValue();
18838 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
18841 case X86ISD::VSRAI:
18842 for (unsigned i=0; i!=NumElts; ++i) {
18843 SDValue CurrentOp = SrcOp->getOperand(i);
18844 if (CurrentOp->isUndef()) {
18845 Elts.push_back(CurrentOp);
18848 ND = cast<ConstantSDNode>(CurrentOp);
18849 const APInt &C = ND->getAPIntValue();
18850 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
18855 return DAG.getBuildVector(VT, dl, Elts);
18858 return DAG.getNode(Opc, dl, VT, SrcOp,
18859 DAG.getConstant(ShiftAmt, dl, MVT::i8));
18862 /// Handle vector element shifts where the shift amount may or may not be a
18863 /// constant. Takes immediate version of shift as input.
18864 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
18865 SDValue SrcOp, SDValue ShAmt,
18866 const X86Subtarget &Subtarget,
18867 SelectionDAG &DAG) {
18868 MVT SVT = ShAmt.getSimpleValueType();
18869 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
18871 // Catch shift-by-constant.
18872 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
18873 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
18874 CShAmt->getZExtValue(), DAG);
18876 // Change opcode to non-immediate version
18877 switch (Opc) {
18878 default: llvm_unreachable("Unknown target vector shift node");
18879 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
18880 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
18881 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
18884 // Need to build a vector containing shift amount.
18885 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
18886 // +=================+============+=======================================+
18887 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
18888 // +=================+============+=======================================+
18889 // | i64 | Yes, No | Use ShAmt as lowest elt |
18890 // | i32 | Yes | zero-extend in-reg |
18891 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
18892 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud) |
18893 // +=================+============+=======================================+
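// Illustrative example (no SSE4.1, i32 shift amount %amt, v8i16 VSHLI):
//   t0: v4i32 = BUILD_VECTOR %amt, 0, undef, undef
//   t1: v8i16 = bitcast t0
//   res: v8i16 = X86ISD::VSHL %src, t1
// Only the low 64 bits of the shift-amount vector are read by the hardware,
// so the upper lanes may be left undefined.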
18895 if (SVT == MVT::i64)
18896 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
18897 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
18898 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
18899 ShAmt = ShAmt.getOperand(0);
18900 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
18901 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
18902 } else if (Subtarget.hasSSE41() &&
18903 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
18904 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
18905 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
18907 SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
18908 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
18909 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
18912 // The return type has to be a 128-bit type with the same element
18913 // type as the input type.
18914 MVT EltVT = VT.getVectorElementType();
18915 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
18917 ShAmt = DAG.getBitcast(ShVT, ShAmt);
18918 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
18921 /// \brief Return Mask with the necessary casting or extending
18922 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
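/// A sketch of the common cases handled below (illustrative, not exhaustive):
///   - i8 mask, MaskVT == v8i1: bitcast i8 -> v8i1.
///   - i8 mask, MaskVT == v4i1: bitcast to v8i1, then extract_subvector 0.
///   - i64 mask on a 32-bit target, MaskVT == v64i1: split into two i32 halves,
///     bitcast each to v32i1 and concatenate.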
18923 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
18924 const X86Subtarget &Subtarget, SelectionDAG &DAG,
18927 if (isAllOnesConstant(Mask))
18928 return DAG.getTargetConstant(1, dl, MaskVT);
18929 if (X86::isZeroNode(Mask))
18930 return DAG.getTargetConstant(0, dl, MaskVT);
18932 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
18933 // Mask should be extended
18934 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
18935 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
18938 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
18939 if (MaskVT == MVT::v64i1) {
18940 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
18941 // In 32-bit mode a bitcast of i64 is illegal; extend/split the mask instead.
18943 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18944 DAG.getConstant(0, dl, MVT::i32));
18945 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18946 DAG.getConstant(1, dl, MVT::i32));
18948 Lo = DAG.getBitcast(MVT::v32i1, Lo);
18949 Hi = DAG.getBitcast(MVT::v32i1, Hi);
18951 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
18953 // MaskVT requires < 64 bits. Truncate the mask (this should always succeed)
18954 // and bitcast it to MaskVT.
18955 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
18956 return DAG.getBitcast(MaskVT,
18957 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
18961 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
18962 Mask.getSimpleValueType().getSizeInBits());
18963 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
18964 // are extracted by EXTRACT_SUBVECTOR.
18965 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
18966 DAG.getBitcast(BitcastVT, Mask),
18967 DAG.getIntPtrConstant(0, dl));
18971 /// \brief Return (and \p Op, \p Mask) for compare instructions or
18972 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
18973 /// necessary casting or extending for \p Mask when lowering masking intrinsics
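/// For example (illustrative), a masked arithmetic intrinsic with a passthru
/// operand lowers roughly to (vselect (v16i1 mask), (op a, b), passthru),
/// whereas a compare-to-mask intrinsic ANDs its vXi1 result with the mask so
/// that masked-off lanes read as zero.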
18974 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
18975 SDValue PreservedSrc,
18976 const X86Subtarget &Subtarget,
18977 SelectionDAG &DAG) {
18978 MVT VT = Op.getSimpleValueType();
18979 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18980 unsigned OpcodeSelect = ISD::VSELECT;
18983 if (isAllOnesConstant(Mask))
18986 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18988 switch (Op.getOpcode()) {
18990 case X86ISD::PCMPEQM:
18991 case X86ISD::PCMPGTM:
18993 case X86ISD::CMPMU:
18994 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
18995 case X86ISD::VFPCLASS:
18996 case X86ISD::VFPCLASSS:
18997 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
18998 case X86ISD::VTRUNC:
18999 case X86ISD::VTRUNCS:
19000 case X86ISD::VTRUNCUS:
19001 case X86ISD::CVTPS2PH:
19002 // We can't use ISD::VSELECT here because it is not always "Legal"
19003 // for the destination type. For example, vpmovqb requires only AVX512,
19004 // while a vselect that operates on byte elements requires BWI.
19005 OpcodeSelect = X86ISD::SELECT;
19008 if (PreservedSrc.isUndef())
19009 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19010 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19013 /// \brief Creates an SDNode for a predicated scalar operation.
19014 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19015 /// The mask comes in as MVT::i8 and should be truncated
19016 /// to MVT::i1 while lowering masking intrinsics.
19017 /// The main difference between ScalarMaskingNode and VectorMaskingNode is that
19018 /// it uses "X86select" instead of "vselect"; we simply can't create a "vselect"
19019 /// node for a scalar instruction.
19020 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19021 SDValue PreservedSrc,
19022 const X86Subtarget &Subtarget,
19023 SelectionDAG &DAG) {
19024 if (isAllOnesConstant(Mask))
19027 MVT VT = Op.getSimpleValueType();
19029 // The mask should be of type MVT::i1
19030 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
19032 if (Op.getOpcode() == X86ISD::FSETCCM ||
19033 Op.getOpcode() == X86ISD::FSETCCM_RND)
19034 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19035 if (Op.getOpcode() == X86ISD::VFPCLASSS)
19036 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19038 if (PreservedSrc.isUndef())
19039 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19040 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19043 static int getSEHRegistrationNodeSize(const Function *Fn) {
19044 if (!Fn->hasPersonalityFn())
19045 report_fatal_error(
19046 "querying registration node size for function without personality");
19047 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19048 // WinEHStatePass for the full struct definition.
19049 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19050 case EHPersonality::MSVC_X86SEH: return 24;
19051 case EHPersonality::MSVC_CXX: return 16;
19054 report_fatal_error(
19055 "can only recover FP for 32-bit MSVC EH personality functions");
19058 /// When the MSVC runtime transfers control to us, either to an outlined
19059 /// function or when returning to a parent frame after catching an exception, we
19060 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19061 /// Here's the math:
19062 /// RegNodeBase = EntryEBP - RegNodeSize
19063 /// ParentFP = RegNodeBase - ParentFrameOffset
19064 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19065 /// subtracting the offset (negative on x86) takes us back to the parent FP.
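/// Worked example with made-up numbers: if EntryEBP = 0x1000 and
/// RegNodeSize = 24 (the 32-bit SEH registration), then
///   RegNodeBase = 0x1000 - 24 = 0xfe8
/// and with ParentFrameOffset = -16 (negative on x86, as noted above)
///   ParentFP = 0xfe8 - (-16) = 0xff8.
/// On x64 the code below instead returns EntryEBP + ParentFrameOffset directly.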
19066 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19067 SDValue EntryEBP) {
19068 MachineFunction &MF = DAG.getMachineFunction();
19071 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19072 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19074 // It's possible that the parent function no longer has a personality function
19075 // if the exceptional code was optimized away, in which case we just return
19076 // the incoming EBP.
19077 if (!Fn->hasPersonalityFn())
19080 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19081 // registration, or the .set_setframe offset.
19082 MCSymbol *OffsetSym =
19083 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19084 GlobalValue::getRealLinkageName(Fn->getName()));
19085 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19086 SDValue ParentFrameOffset =
19087 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19089 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19090 // the prologue to RBP in the parent function.
19091 const X86Subtarget &Subtarget =
19092 static_cast<const X86Subtarget &>(DAG.getSubtarget());
19093 if (Subtarget.is64Bit())
19094 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19096 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19097 // RegNodeBase = EntryEBP - RegNodeSize
19098 // ParentFP = RegNodeBase - ParentFrameOffset
19099 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19100 DAG.getConstant(RegNodeSize, dl, PtrVT));
19101 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19104 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19105 SelectionDAG &DAG) {
19106 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19107 auto isRoundModeCurDirection = [](SDValue Rnd) {
19108 if (!isa<ConstantSDNode>(Rnd))
19111 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19112 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19116 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19117 MVT VT = Op.getSimpleValueType();
19118 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19120 switch(IntrData->Type) {
19121 case INTR_TYPE_1OP:
19122 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19123 case INTR_TYPE_2OP:
19124 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19126 case INTR_TYPE_3OP:
19127 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19128 Op.getOperand(2), Op.getOperand(3));
19129 case INTR_TYPE_4OP:
19130 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19131 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19132 case INTR_TYPE_1OP_MASK_RM: {
19133 SDValue Src = Op.getOperand(1);
19134 SDValue PassThru = Op.getOperand(2);
19135 SDValue Mask = Op.getOperand(3);
19136 SDValue RoundingMode;
19137 // We always add a rounding mode to the node.
19138 // If the rounding mode is not specified, we add the
19139 // "current direction" mode.
19140 if (Op.getNumOperands() == 4)
19142 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19144 RoundingMode = Op.getOperand(4);
19145 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19146 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19148 Mask, PassThru, Subtarget, DAG);
19150 case INTR_TYPE_1OP_MASK: {
19151 SDValue Src = Op.getOperand(1);
19152 SDValue PassThru = Op.getOperand(2);
19153 SDValue Mask = Op.getOperand(3);
19154 // We add rounding mode to the Node when
19155 // - RM Opcode is specified and
19156 // - RM is not "current direction".
19157 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19158 if (IntrWithRoundingModeOpcode != 0) {
19159 SDValue Rnd = Op.getOperand(4);
19160 if (!isRoundModeCurDirection(Rnd)) {
19161 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19162 dl, Op.getValueType(),
19164 Mask, PassThru, Subtarget, DAG);
19167 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19168 Mask, PassThru, Subtarget, DAG);
19170 case INTR_TYPE_SCALAR_MASK: {
19171 SDValue Src1 = Op.getOperand(1);
19172 SDValue Src2 = Op.getOperand(2);
19173 SDValue passThru = Op.getOperand(3);
19174 SDValue Mask = Op.getOperand(4);
19175 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19176 if (IntrWithRoundingModeOpcode != 0) {
19177 SDValue Rnd = Op.getOperand(5);
19178 if (!isRoundModeCurDirection(Rnd))
19179 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19180 dl, VT, Src1, Src2, Rnd),
19181 Mask, passThru, Subtarget, DAG);
19183 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
19184 Mask, passThru, Subtarget, DAG);
19186 case INTR_TYPE_SCALAR_MASK_RM: {
19187 SDValue Src1 = Op.getOperand(1);
19188 SDValue Src2 = Op.getOperand(2);
19189 SDValue Src0 = Op.getOperand(3);
19190 SDValue Mask = Op.getOperand(4);
19191 // There are 2 kinds of intrinsics in this group:
19192 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
19193 // (2) With both rounding mode and sae - 7 operands.
19194 if (Op.getNumOperands() == 6) {
19195 SDValue Sae = Op.getOperand(5);
19196 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19198 Mask, Src0, Subtarget, DAG);
19200 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19201 SDValue RoundingMode = Op.getOperand(5);
19202 SDValue Sae = Op.getOperand(6);
19203 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19204 RoundingMode, Sae),
19205 Mask, Src0, Subtarget, DAG);
19207 case INTR_TYPE_2OP_MASK:
19208 case INTR_TYPE_2OP_IMM8_MASK: {
19209 SDValue Src1 = Op.getOperand(1);
19210 SDValue Src2 = Op.getOperand(2);
19211 SDValue PassThru = Op.getOperand(3);
19212 SDValue Mask = Op.getOperand(4);
19214 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
19215 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
19217 // We specify 2 possible opcodes for intrinsics with rounding modes.
19218 // First, we check if the intrinsic may have non-default rounding mode,
19219 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19220 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19221 if (IntrWithRoundingModeOpcode != 0) {
19222 SDValue Rnd = Op.getOperand(5);
19223 if (!isRoundModeCurDirection(Rnd)) {
19224 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19225 dl, Op.getValueType(),
19227 Mask, PassThru, Subtarget, DAG);
19230 // TODO: Intrinsics should have fast-math-flags to propagate.
19231 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
19232 Mask, PassThru, Subtarget, DAG);
19234 case INTR_TYPE_2OP_MASK_RM: {
19235 SDValue Src1 = Op.getOperand(1);
19236 SDValue Src2 = Op.getOperand(2);
19237 SDValue PassThru = Op.getOperand(3);
19238 SDValue Mask = Op.getOperand(4);
19239 // We specify 2 possible modes for intrinsics, with/without rounding
19240 // mode.
19241 // First, we check if the intrinsic has a rounding mode (6 operands);
19242 // if not, we set the rounding mode to "current".
19244 if (Op.getNumOperands() == 6)
19245 Rnd = Op.getOperand(5);
19247 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19248 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19250 Mask, PassThru, Subtarget, DAG);
19252 case INTR_TYPE_3OP_SCALAR_MASK_RM: {
19253 SDValue Src1 = Op.getOperand(1);
19254 SDValue Src2 = Op.getOperand(2);
19255 SDValue Src3 = Op.getOperand(3);
19256 SDValue PassThru = Op.getOperand(4);
19257 SDValue Mask = Op.getOperand(5);
19258 SDValue Sae = Op.getOperand(6);
19260 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19262 Mask, PassThru, Subtarget, DAG);
19264 case INTR_TYPE_3OP_MASK_RM: {
19265 SDValue Src1 = Op.getOperand(1);
19266 SDValue Src2 = Op.getOperand(2);
19267 SDValue Imm = Op.getOperand(3);
19268 SDValue PassThru = Op.getOperand(4);
19269 SDValue Mask = Op.getOperand(5);
19270 // We specify 2 possible modes for intrinsics, with/without rounding
19271 // mode.
19272 // First, we check if the intrinsic has a rounding mode (7 operands);
19273 // if not, we set the rounding mode to "current".
19275 if (Op.getNumOperands() == 7)
19276 Rnd = Op.getOperand(6);
19278 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19279 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19280 Src1, Src2, Imm, Rnd),
19281 Mask, PassThru, Subtarget, DAG);
19283 case INTR_TYPE_3OP_IMM8_MASK:
19284 case INTR_TYPE_3OP_MASK: {
19285 SDValue Src1 = Op.getOperand(1);
19286 SDValue Src2 = Op.getOperand(2);
19287 SDValue Src3 = Op.getOperand(3);
19288 SDValue PassThru = Op.getOperand(4);
19289 SDValue Mask = Op.getOperand(5);
19291 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
19292 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
19294 // We specify 2 possible opcodes for intrinsics with rounding modes.
19295 // First, we check if the intrinsic may have non-default rounding mode,
19296 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19297 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19298 if (IntrWithRoundingModeOpcode != 0) {
19299 SDValue Rnd = Op.getOperand(6);
19300 if (!isRoundModeCurDirection(Rnd)) {
19301 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19302 dl, Op.getValueType(),
19303 Src1, Src2, Src3, Rnd),
19304 Mask, PassThru, Subtarget, DAG);
19307 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19309 Mask, PassThru, Subtarget, DAG);
19311 case VPERM_2OP_MASK : {
19312 SDValue Src1 = Op.getOperand(1);
19313 SDValue Src2 = Op.getOperand(2);
19314 SDValue PassThru = Op.getOperand(3);
19315 SDValue Mask = Op.getOperand(4);
19317 // Swap Src1 and Src2 in the node creation
19318 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
19319 Mask, PassThru, Subtarget, DAG);
19321 case VPERM_3OP_MASKZ:
19322 case VPERM_3OP_MASK:{
19323 MVT VT = Op.getSimpleValueType();
19324 // Src2 is the PassThru
19325 SDValue Src1 = Op.getOperand(1);
19326 // PassThru needs to be the same type as the destination in order
19327 // to pattern match correctly.
19328 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
19329 SDValue Src3 = Op.getOperand(3);
19330 SDValue Mask = Op.getOperand(4);
19331 SDValue PassThru = SDValue();
19333 // set PassThru element
19334 if (IntrData->Type == VPERM_3OP_MASKZ)
19335 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19339 // Swap Src1 and Src2 in the node creation
19340 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19341 dl, Op.getValueType(),
19343 Mask, PassThru, Subtarget, DAG);
19347 case FMA_OP_MASK: {
19348 SDValue Src1 = Op.getOperand(1);
19349 SDValue Src2 = Op.getOperand(2);
19350 SDValue Src3 = Op.getOperand(3);
19351 SDValue Mask = Op.getOperand(4);
19352 MVT VT = Op.getSimpleValueType();
19353 SDValue PassThru = SDValue();
19355 // set PassThru element
19356 if (IntrData->Type == FMA_OP_MASKZ)
19357 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19358 else if (IntrData->Type == FMA_OP_MASK3)
19363 // We specify 2 possible opcodes for intrinsics with rounding modes.
19364 // First, we check if the intrinsic may have non-default rounding mode,
19365 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19366 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19367 if (IntrWithRoundingModeOpcode != 0) {
19368 SDValue Rnd = Op.getOperand(5);
19369 if (!isRoundModeCurDirection(Rnd))
19370 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19371 dl, Op.getValueType(),
19372 Src1, Src2, Src3, Rnd),
19373 Mask, PassThru, Subtarget, DAG);
19375 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19376 dl, Op.getValueType(),
19378 Mask, PassThru, Subtarget, DAG);
19380 case FMA_OP_SCALAR_MASK:
19381 case FMA_OP_SCALAR_MASK3:
19382 case FMA_OP_SCALAR_MASKZ: {
19383 SDValue Src1 = Op.getOperand(1);
19384 SDValue Src2 = Op.getOperand(2);
19385 SDValue Src3 = Op.getOperand(3);
19386 SDValue Mask = Op.getOperand(4);
19387 MVT VT = Op.getSimpleValueType();
19388 SDValue PassThru = SDValue();
19390 // set PassThru element
19391 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
19392 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19393 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
19398 SDValue Rnd = Op.getOperand(5);
19399 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
19400 Op.getValueType(), Src1, Src2,
19402 Mask, PassThru, Subtarget, DAG);
19404 case TERLOG_OP_MASK:
19405 case TERLOG_OP_MASKZ: {
19406 SDValue Src1 = Op.getOperand(1);
19407 SDValue Src2 = Op.getOperand(2);
19408 SDValue Src3 = Op.getOperand(3);
19409 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
19410 SDValue Mask = Op.getOperand(5);
19411 MVT VT = Op.getSimpleValueType();
19412 SDValue PassThru = Src1;
19413 // Set PassThru element.
19414 if (IntrData->Type == TERLOG_OP_MASKZ)
19415 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19417 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19418 Src1, Src2, Src3, Src4),
19419 Mask, PassThru, Subtarget, DAG);
19420 }
19421 case CVTPD2PS:
19422 // ISD::FP_ROUND has a second argument that indicates if the truncation
19423 // does not change the value. Set it to 0 since it can change.
19424 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
19425 DAG.getIntPtrConstant(0, dl));
19426 case CVTPD2PS_MASK: {
19427 SDValue Src = Op.getOperand(1);
19428 SDValue PassThru = Op.getOperand(2);
19429 SDValue Mask = Op.getOperand(3);
19430 // We add rounding mode to the Node when
19431 // - RM Opcode is specified and
19432 // - RM is not "current direction".
19433 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19434 if (IntrWithRoundingModeOpcode != 0) {
19435 SDValue Rnd = Op.getOperand(4);
19436 if (!isRoundModeCurDirection(Rnd)) {
19437 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19438 dl, Op.getValueType(),
19440 Mask, PassThru, Subtarget, DAG);
19443 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
19444 // ISD::FP_ROUND has a second argument that indicates if the truncation
19445 // does not change the value. Set it to 0 since it can change.
19446 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19447 DAG.getIntPtrConstant(0, dl)),
19448 Mask, PassThru, Subtarget, DAG);
19451 // FPclass intrinsics with mask
19452 SDValue Src1 = Op.getOperand(1);
19453 MVT VT = Src1.getSimpleValueType();
19454 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19455 SDValue Imm = Op.getOperand(2);
19456 SDValue Mask = Op.getOperand(3);
19457 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19458 Mask.getSimpleValueType().getSizeInBits());
19459 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
19460 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
19461 DAG.getTargetConstant(0, dl, MaskVT),
19463 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19464 DAG.getUNDEF(BitcastVT), FPclassMask,
19465 DAG.getIntPtrConstant(0, dl));
19466 return DAG.getBitcast(Op.getValueType(), Res);
19469 SDValue Src1 = Op.getOperand(1);
19470 SDValue Imm = Op.getOperand(2);
19471 SDValue Mask = Op.getOperand(3);
19472 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
19473 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
19474 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
19475 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
19478 case CMP_MASK_CC: {
19479 // Comparison intrinsics with masks.
19480 // Example of transformation:
19481 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
19482 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
19483 // (i8 (bitcast
19484 // (v8i1 (insert_subvector undef,
19485 // (v2i1 (and (PCMPEQM %a, %b),
19486 // (extract_subvector
19487 // (v8i1 (bitcast %mask)), 0))), 0))))
19488 MVT VT = Op.getOperand(1).getSimpleValueType();
19489 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19490 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
19491 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19492 Mask.getSimpleValueType().getSizeInBits());
19494 if (IntrData->Type == CMP_MASK_CC) {
19495 SDValue CC = Op.getOperand(3);
19496 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
19497 // We specify 2 possible opcodes for intrinsics with rounding modes.
19498 // First, we check if the intrinsic may have non-default rounding mode,
19499 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19500 if (IntrData->Opc1 != 0) {
19501 SDValue Rnd = Op.getOperand(5);
19502 if (!isRoundModeCurDirection(Rnd))
19503 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
19504 Op.getOperand(2), CC, Rnd);
19506 // Default rounding mode.
19508 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19509 Op.getOperand(2), CC);
19512 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
19513 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19516 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
19517 DAG.getTargetConstant(0, dl,
19520 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19521 DAG.getUNDEF(BitcastVT), CmpMask,
19522 DAG.getIntPtrConstant(0, dl));
19523 return DAG.getBitcast(Op.getValueType(), Res);
19525 case CMP_MASK_SCALAR_CC: {
19526 SDValue Src1 = Op.getOperand(1);
19527 SDValue Src2 = Op.getOperand(2);
19528 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
19529 SDValue Mask = Op.getOperand(4);
19532 if (IntrData->Opc1 != 0) {
19533 SDValue Rnd = Op.getOperand(5);
19534 if (!isRoundModeCurDirection(Rnd))
19535 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
19537 // Default rounding mode.
19539 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
19541 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
19542 DAG.getTargetConstant(0, dl,
19546 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
19548 case COMI: { // Comparison intrinsics
19549 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
19550 SDValue LHS = Op.getOperand(1);
19551 SDValue RHS = Op.getOperand(2);
19552 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
19553 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
19556 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
19557 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
19558 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
19559 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
19562 case ISD::SETNE: { // (ZF = 1 or PF = 1)
19563 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
19564 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
19565 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
19568 case ISD::SETGT: // (CF = 0 and ZF = 0)
19569 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
19571 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
19572 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
19575 case ISD::SETGE: // CF = 0
19576 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
19578 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
19579 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
19582 llvm_unreachable("Unexpected illegal condition!");
19584 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19586 case COMI_RM: { // Comparison intrinsics with Sae
19587 SDValue LHS = Op.getOperand(1);
19588 SDValue RHS = Op.getOperand(2);
19589 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
19590 SDValue Sae = Op.getOperand(4);
19593 if (isRoundModeCurDirection(Sae))
19594 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
19595 DAG.getConstant(CondVal, dl, MVT::i8));
19597 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
19598 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19599 // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
19600 return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
19603 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19604 Op.getOperand(1), Op.getOperand(2), Subtarget,
19606 case COMPRESS_EXPAND_IN_REG: {
19607 SDValue Mask = Op.getOperand(3);
19608 SDValue DataToCompress = Op.getOperand(1);
19609 SDValue PassThru = Op.getOperand(2);
19610 if (isAllOnesConstant(Mask)) // return data as is
19611 return Op.getOperand(1);
19613 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19615 Mask, PassThru, Subtarget, DAG);
19618 SDValue Mask = Op.getOperand(1);
19619 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19620 Mask.getSimpleValueType().getSizeInBits());
19621 Mask = DAG.getBitcast(MaskVT, Mask);
19622 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19625 MVT VT = Op.getSimpleValueType();
19626 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19628 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19629 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19630 // Arguments should be swapped.
19631 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19632 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19634 return DAG.getBitcast(VT, Res);
19637 MVT VT = Op.getSimpleValueType();
19638 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19640 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19641 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19642 SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
19643 return DAG.getBitcast(VT, Res);
19646 case FIXUPIMMS_MASKZ:
19648 case FIXUPIMM_MASKZ:{
19649 SDValue Src1 = Op.getOperand(1);
19650 SDValue Src2 = Op.getOperand(2);
19651 SDValue Src3 = Op.getOperand(3);
19652 SDValue Imm = Op.getOperand(4);
19653 SDValue Mask = Op.getOperand(5);
19654 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19655 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19656 // We specify 2 possible modes for intrinsics, with/without rounding
19657 // mode.
19658 // First, we check if the intrinsic has a rounding mode (7 operands);
19659 // if not, we set the rounding mode to "current".
19661 if (Op.getNumOperands() == 7)
19662 Rnd = Op.getOperand(6);
19664 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19665 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19666 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19667 Src1, Src2, Src3, Imm, Rnd),
19668 Mask, Passthru, Subtarget, DAG);
19669 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19670 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19671 Src1, Src2, Src3, Imm, Rnd),
19672 Mask, Passthru, Subtarget, DAG);
19674 case CONVERT_TO_MASK: {
19675 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19676 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19677 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19679 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
19681 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19682 DAG.getUNDEF(BitcastVT), CvtMask,
19683 DAG.getIntPtrConstant(0, dl));
19684 return DAG.getBitcast(Op.getValueType(), Res);
19686 case CONVERT_MASK_TO_VEC: {
19687 SDValue Mask = Op.getOperand(1);
19688 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19689 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19690 return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
19692 case BRCST_SUBVEC_TO_VEC: {
19693 SDValue Src = Op.getOperand(1);
19694 SDValue Passthru = Op.getOperand(2);
19695 SDValue Mask = Op.getOperand(3);
19696 EVT resVT = Passthru.getValueType();
19697 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19698 DAG.getUNDEF(resVT), Src,
19699 DAG.getIntPtrConstant(0, dl));
19701 if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
19702 immVal = DAG.getConstant(0x44, dl, MVT::i8);
19704 immVal = DAG.getConstant(0, dl, MVT::i8);
19705 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19706 subVec, subVec, immVal),
19707 Mask, Passthru, Subtarget, DAG);
19709 case BRCST32x2_TO_VEC: {
19710 SDValue Src = Op.getOperand(1);
19711 SDValue PassThru = Op.getOperand(2);
19712 SDValue Mask = Op.getOperand(3);
19714 assert((VT.getScalarType() == MVT::i32 ||
19715 VT.getScalarType() == MVT::f32) && "Unexpected type!");
19716 // Bitcast Src to packed 64-bit elements.
19717 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19718 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19719 Src = DAG.getBitcast(BitcastVT, Src);
19721 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19722 Mask, PassThru, Subtarget, DAG);
19730 default: return SDValue(); // Don't custom lower most intrinsics.
19732 case Intrinsic::x86_avx2_permd:
19733 case Intrinsic::x86_avx2_permps:
19734 // Operands intentionally swapped. Mask is last operand to intrinsic,
19735 // but second operand for node/instruction.
19736 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
19737 Op.getOperand(2), Op.getOperand(1));
19739 // ptest and testp intrinsics. The intrinsics these come from are designed to
19740 // return an integer value, not just an instruction, so lower them to the
19741 // ptest or testp pattern and a setcc for the result.
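// Illustrative example: _mm_testz_si128(a, b) arrives here as
// Intrinsic::x86_sse41_ptestz and becomes
//   (zext i32 (setcc COND_E, (X86ISD::PTEST a, b)))
// i.e. a PTEST followed by SETE and a zero-extension of the flag result.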
19742 case Intrinsic::x86_sse41_ptestz:
19743 case Intrinsic::x86_sse41_ptestc:
19744 case Intrinsic::x86_sse41_ptestnzc:
19745 case Intrinsic::x86_avx_ptestz_256:
19746 case Intrinsic::x86_avx_ptestc_256:
19747 case Intrinsic::x86_avx_ptestnzc_256:
19748 case Intrinsic::x86_avx_vtestz_ps:
19749 case Intrinsic::x86_avx_vtestc_ps:
19750 case Intrinsic::x86_avx_vtestnzc_ps:
19751 case Intrinsic::x86_avx_vtestz_pd:
19752 case Intrinsic::x86_avx_vtestc_pd:
19753 case Intrinsic::x86_avx_vtestnzc_pd:
19754 case Intrinsic::x86_avx_vtestz_ps_256:
19755 case Intrinsic::x86_avx_vtestc_ps_256:
19756 case Intrinsic::x86_avx_vtestnzc_ps_256:
19757 case Intrinsic::x86_avx_vtestz_pd_256:
19758 case Intrinsic::x86_avx_vtestc_pd_256:
19759 case Intrinsic::x86_avx_vtestnzc_pd_256: {
19760 bool IsTestPacked = false;
19761 X86::CondCode X86CC;
19763 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
19764 case Intrinsic::x86_avx_vtestz_ps:
19765 case Intrinsic::x86_avx_vtestz_pd:
19766 case Intrinsic::x86_avx_vtestz_ps_256:
19767 case Intrinsic::x86_avx_vtestz_pd_256:
19768 IsTestPacked = true;
19770 case Intrinsic::x86_sse41_ptestz:
19771 case Intrinsic::x86_avx_ptestz_256:
19773 X86CC = X86::COND_E;
19775 case Intrinsic::x86_avx_vtestc_ps:
19776 case Intrinsic::x86_avx_vtestc_pd:
19777 case Intrinsic::x86_avx_vtestc_ps_256:
19778 case Intrinsic::x86_avx_vtestc_pd_256:
19779 IsTestPacked = true;
19781 case Intrinsic::x86_sse41_ptestc:
19782 case Intrinsic::x86_avx_ptestc_256:
19784 X86CC = X86::COND_B;
19786 case Intrinsic::x86_avx_vtestnzc_ps:
19787 case Intrinsic::x86_avx_vtestnzc_pd:
19788 case Intrinsic::x86_avx_vtestnzc_ps_256:
19789 case Intrinsic::x86_avx_vtestnzc_pd_256:
19790 IsTestPacked = true;
19792 case Intrinsic::x86_sse41_ptestnzc:
19793 case Intrinsic::x86_avx_ptestnzc_256:
19795 X86CC = X86::COND_A;
19799 SDValue LHS = Op.getOperand(1);
19800 SDValue RHS = Op.getOperand(2);
19801 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
19802 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
19803 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19804 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19806 case Intrinsic::x86_avx512_kortestz_w:
19807 case Intrinsic::x86_avx512_kortestc_w: {
19808 X86::CondCode X86CC =
19809 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
19810 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19811 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19812 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
19813 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19814 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19817 case Intrinsic::x86_avx512_knot_w: {
19818 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19819 SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
19820 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
19821 return DAG.getBitcast(MVT::i16, Res);
19824 case Intrinsic::x86_avx512_kandn_w: {
19825 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19826 // Invert LHS for the not.
19827 LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
19828 DAG.getConstant(1, dl, MVT::v16i1));
19829 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19830 SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
19831 return DAG.getBitcast(MVT::i16, Res);
19834 case Intrinsic::x86_avx512_kxnor_w: {
19835 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19836 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19837 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
19838 // Invert result for the not.
19839 Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
19840 DAG.getConstant(1, dl, MVT::v16i1));
19841 return DAG.getBitcast(MVT::i16, Res);
19844 case Intrinsic::x86_sse42_pcmpistria128:
19845 case Intrinsic::x86_sse42_pcmpestria128:
19846 case Intrinsic::x86_sse42_pcmpistric128:
19847 case Intrinsic::x86_sse42_pcmpestric128:
19848 case Intrinsic::x86_sse42_pcmpistrio128:
19849 case Intrinsic::x86_sse42_pcmpestrio128:
19850 case Intrinsic::x86_sse42_pcmpistris128:
19851 case Intrinsic::x86_sse42_pcmpestris128:
19852 case Intrinsic::x86_sse42_pcmpistriz128:
19853 case Intrinsic::x86_sse42_pcmpestriz128: {
19855 X86::CondCode X86CC;
19857 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
19858 case Intrinsic::x86_sse42_pcmpistria128:
19859 Opcode = X86ISD::PCMPISTRI;
19860 X86CC = X86::COND_A;
19862 case Intrinsic::x86_sse42_pcmpestria128:
19863 Opcode = X86ISD::PCMPESTRI;
19864 X86CC = X86::COND_A;
19866 case Intrinsic::x86_sse42_pcmpistric128:
19867 Opcode = X86ISD::PCMPISTRI;
19868 X86CC = X86::COND_B;
19870 case Intrinsic::x86_sse42_pcmpestric128:
19871 Opcode = X86ISD::PCMPESTRI;
19872 X86CC = X86::COND_B;
19874 case Intrinsic::x86_sse42_pcmpistrio128:
19875 Opcode = X86ISD::PCMPISTRI;
19876 X86CC = X86::COND_O;
19878 case Intrinsic::x86_sse42_pcmpestrio128:
19879 Opcode = X86ISD::PCMPESTRI;
19880 X86CC = X86::COND_O;
19882 case Intrinsic::x86_sse42_pcmpistris128:
19883 Opcode = X86ISD::PCMPISTRI;
19884 X86CC = X86::COND_S;
19886 case Intrinsic::x86_sse42_pcmpestris128:
19887 Opcode = X86ISD::PCMPESTRI;
19888 X86CC = X86::COND_S;
19890 case Intrinsic::x86_sse42_pcmpistriz128:
19891 Opcode = X86ISD::PCMPISTRI;
19892 X86CC = X86::COND_E;
19894 case Intrinsic::x86_sse42_pcmpestriz128:
19895 Opcode = X86ISD::PCMPESTRI;
19896 X86CC = X86::COND_E;
19899 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19900 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19901 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
19902 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
19903 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19906 case Intrinsic::x86_sse42_pcmpistri128:
19907 case Intrinsic::x86_sse42_pcmpestri128: {
19909 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
19910 Opcode = X86ISD::PCMPISTRI;
19912 Opcode = X86ISD::PCMPESTRI;
19914 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19915 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19916 return DAG.getNode(Opcode, dl, VTs, NewOps);
19919 case Intrinsic::eh_sjlj_lsda: {
19920 MachineFunction &MF = DAG.getMachineFunction();
19921 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19922 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19923 auto &Context = MF.getMMI().getContext();
19924 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
19925 Twine(MF.getFunctionNumber()));
19926 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
19929 case Intrinsic::x86_seh_lsda: {
19930 // Compute the symbol for the LSDA. We know it'll get emitted later.
19931 MachineFunction &MF = DAG.getMachineFunction();
19932 SDValue Op1 = Op.getOperand(1);
19933 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
19934 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
19935 GlobalValue::getRealLinkageName(Fn->getName()));
19937 // Generate a simple absolute symbol reference. This intrinsic is only
19938 // supported on 32-bit Windows, which isn't PIC.
19939 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
19940 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
19941 }
19943 case Intrinsic::x86_seh_recoverfp: {
19944 SDValue FnOp = Op.getOperand(1);
19945 SDValue IncomingFPOp = Op.getOperand(2);
19946 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
19947 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
19948 if (!Fn)
19949 report_fatal_error(
19950 "llvm.x86.seh.recoverfp must take a function as the first argument");
19951 return recoverFramePointer(DAG, Fn, IncomingFPOp);
19952 }
19954 case Intrinsic::localaddress: {
19955 // Returns one of the stack, base, or frame pointer registers, depending on
19956 // which is used to reference local variables.
19957 MachineFunction &MF = DAG.getMachineFunction();
19958 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19959 unsigned Reg;
19960 if (RegInfo->hasBasePointer(MF))
19961 Reg = RegInfo->getBaseRegister();
19962 else // This function handles the SP or FP case.
19963 Reg = RegInfo->getPtrSizedFrameRegister(MF);
19964 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
19965 }
19966 }
19967 }
19969 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19970 SDValue Src, SDValue Mask, SDValue Base,
19971 SDValue Index, SDValue ScaleOp, SDValue Chain,
19972 const X86Subtarget &Subtarget) {
19973 SDLoc dl(Op);
19974 auto *C = cast<ConstantSDNode>(ScaleOp);
19975 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
19976 EVT MaskVT = Mask.getValueType();
19977 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
19978 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19979 SDValue Segment = DAG.getRegister(0, MVT::i32);
19980 // If source is undef or we know it won't be used, use a zero vector
19981 // to break register dependency.
19982 // TODO: use undef instead and let ExecutionDepsFix deal with it?
19983 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
19984 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
19985 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
19986 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
19987 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
19988 return DAG.getMergeValues(RetOps, dl);
19989 }
19991 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19992 SDValue Src, SDValue Mask, SDValue Base,
19993 SDValue Index, SDValue ScaleOp, SDValue Chain,
19994 const X86Subtarget &Subtarget) {
19995 SDLoc dl(Op);
19996 auto *C = cast<ConstantSDNode>(ScaleOp);
19997 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
19998 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19999 Index.getSimpleValueType().getVectorNumElements());
20001 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20002 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20003 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20004 SDValue Segment = DAG.getRegister(0, MVT::i32);
20005 // If source is undef or we know it won't be used, use a zero vector
20006 // to break register dependency.
20007 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20008 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20009 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20010 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20011 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20012 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20013 return DAG.getMergeValues(RetOps, dl);
20014 }
20016 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20017 SDValue Src, SDValue Mask, SDValue Base,
20018 SDValue Index, SDValue ScaleOp, SDValue Chain,
20019 const X86Subtarget &Subtarget) {
20020 SDLoc dl(Op);
20021 auto *C = cast<ConstantSDNode>(ScaleOp);
20022 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20023 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20024 SDValue Segment = DAG.getRegister(0, MVT::i32);
20025 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20026 Index.getSimpleValueType().getVectorNumElements());
20028 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20029 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20030 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20031 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20032 return SDValue(Res, 1);
20033 }
20035 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20036 SDValue Mask, SDValue Base, SDValue Index,
20037 SDValue ScaleOp, SDValue Chain,
20038 const X86Subtarget &Subtarget) {
20039 SDLoc dl(Op);
20040 auto *C = cast<ConstantSDNode>(ScaleOp);
20041 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20042 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20043 SDValue Segment = DAG.getRegister(0, MVT::i32);
20044 MVT MaskVT =
20045 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20046 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20047 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20048 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20049 return SDValue(Res, 0);
20050 }
20052 /// Handles the lowering of builtin intrinsics that return the value
20053 /// of the extended control register.
20054 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
20055 SelectionDAG &DAG,
20056 const X86Subtarget &Subtarget,
20057 SmallVectorImpl<SDValue> &Results) {
20058 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20059 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20060 SDValue LO, HI;
20062 // The ECX register is used to select the index of the XCR register to
20063 // return.
20064 SDValue Chain =
20065 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20066 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20067 Chain = SDValue(N1, 0);
20069 // Reads the content of XCR and returns it in registers EDX:EAX.
20070 if (Subtarget.is64Bit()) {
20071 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20072 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20073 LO.getValue(2));
20074 } else {
20075 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20076 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20077 LO.getValue(2));
20078 }
20079 Chain = HI.getValue(1);
20081 if (Subtarget.is64Bit()) {
20082 // Merge the two 32-bit values into a 64-bit one.
20083 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20084 DAG.getConstant(32, DL, MVT::i8));
20085 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20086 Results.push_back(Chain);
20087 return;
20088 }
20090 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20091 SDValue Ops[] = { LO, HI };
20092 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20093 Results.push_back(Pair);
20094 Results.push_back(Chain);
20095 }
20097 /// Handles the lowering of builtin intrinsics that read performance monitor
20098 /// counters (x86_rdpmc).
20099 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
20100 SelectionDAG &DAG,
20101 const X86Subtarget &Subtarget,
20102 SmallVectorImpl<SDValue> &Results) {
20103 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20104 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20105 SDValue LO, HI;
20107 // The ECX register is used to select the index of the performance counter
20108 // to read.
20109 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
20110 N->getOperand(2));
20111 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
20113 // Reads the content of a 64-bit performance counter and returns it in the
20114 // registers EDX:EAX.
20115 if (Subtarget.is64Bit()) {
20116 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20117 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20118 LO.getValue(2));
20119 } else {
20120 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20121 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20122 LO.getValue(2));
20123 }
20124 Chain = HI.getValue(1);
20126 if (Subtarget.is64Bit()) {
20127 // The EAX register is loaded with the low-order 32 bits. The EDX register
20128 // is loaded with the supported high-order bits of the counter.
20129 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20130 DAG.getConstant(32, DL, MVT::i8));
20131 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20132 Results.push_back(Chain);
20133 return;
20134 }
20136 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20137 SDValue Ops[] = { LO, HI };
20138 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20139 Results.push_back(Pair);
20140 Results.push_back(Chain);
20141 }
20143 /// Handles the lowering of builtin intrinsics that read the time stamp counter
20144 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
20145 /// READCYCLECOUNTER nodes.
20146 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
20147 SelectionDAG &DAG,
20148 const X86Subtarget &Subtarget,
20149 SmallVectorImpl<SDValue> &Results) {
20150 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20151 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
20152 SDValue LO, HI;
20154 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
20155 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
20156 // and the EAX register is loaded with the low-order 32 bits.
20157 if (Subtarget.is64Bit()) {
20158 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20159 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20160 LO.getValue(2));
20161 } else {
20162 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20163 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20164 LO.getValue(2));
20165 }
20166 SDValue Chain = HI.getValue(1);
20168 if (Opcode == X86ISD::RDTSCP_DAG) {
20169 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20171 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
20172 // the ECX register. Add 'ecx' explicitly to the chain.
20173 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
20174 HI.getValue(2));
20175 // Explicitly store the content of ECX at the location passed in input
20176 // to the 'rdtscp' intrinsic.
20177 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
20178 MachinePointerInfo());
20179 }
20181 if (Subtarget.is64Bit()) {
20182 // The EDX register is loaded with the high-order 32 bits of the MSR, and
20183 // the EAX register is loaded with the low-order 32 bits.
20184 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20185 DAG.getConstant(32, DL, MVT::i8));
20186 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20187 Results.push_back(Chain);
20188 return;
20189 }
20191 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20192 SDValue Ops[] = { LO, HI };
20193 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20194 Results.push_back(Pair);
20195 Results.push_back(Chain);
20196 }
20198 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
20199 SelectionDAG &DAG) {
20200 SmallVector<SDValue, 2> Results;
20201 SDLoc DL(Op);
20202 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
20203 Results);
20204 return DAG.getMergeValues(Results, DL);
20205 }
20207 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
20208 MachineFunction &MF = DAG.getMachineFunction();
20209 SDValue Chain = Op.getOperand(0);
20210 SDValue RegNode = Op.getOperand(2);
20211 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20212 if (!EHInfo)
20213 report_fatal_error("EH registrations only live in functions using WinEH");
20215 // Cast the operand to an alloca, and remember the frame index.
20216 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
20217 if (!FINode)
20218 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
20219 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
20221 // Return the chain operand without making any DAG nodes.
20222 return Chain;
20223 }
20225 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
20226 MachineFunction &MF = DAG.getMachineFunction();
20227 SDValue Chain = Op.getOperand(0);
20228 SDValue EHGuard = Op.getOperand(2);
20229 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20230 if (!EHInfo)
20231 report_fatal_error("EHGuard only live in functions using WinEH");
20233 // Cast the operand to an alloca, and remember the frame index.
20234 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
20235 if (!FINode)
20236 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
20237 EHInfo->EHGuardFrameIndex = FINode->getIndex();
20239 // Return the chain operand without making any DAG nodes.
20240 return Chain;
20241 }
20243 /// Emit Truncating Store with signed or unsigned saturation.
20244 static SDValue
20245 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
20246 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
20247 SelectionDAG &DAG) {
20249 SDVTList VTs = DAG.getVTList(MVT::Other);
20250 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
20251 SDValue Ops[] = { Chain, Val, Ptr, Undef };
20252 return SignedSat ?
20253 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20254 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20255 }
20257 /// Emit Masked Truncating Store with signed or unsigned saturation.
20258 static SDValue
20259 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
20260 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
20261 MachineMemOperand *MMO, SelectionDAG &DAG) {
20263 SDVTList VTs = DAG.getVTList(MVT::Other);
20264 SDValue Ops[] = { Chain, Ptr, Mask, Val };
20265 return SignedSat ?
20266 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20267 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20268 }
20270 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
20271 SelectionDAG &DAG) {
20272 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
20274 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
20275 if (!IntrData) {
20276 switch (IntNo) {
20277 case llvm::Intrinsic::x86_seh_ehregnode:
20278 return MarkEHRegistrationNode(Op, DAG);
20279 case llvm::Intrinsic::x86_seh_ehguard:
20280 return MarkEHGuard(Op, DAG);
20281 case llvm::Intrinsic::x86_flags_read_u32:
20282 case llvm::Intrinsic::x86_flags_read_u64:
20283 case llvm::Intrinsic::x86_flags_write_u32:
20284 case llvm::Intrinsic::x86_flags_write_u64: {
20285 // We need a frame pointer because this will get lowered to a PUSH/POP
20286 // sequence.
20287 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20288 MFI.setHasCopyImplyingStackAdjustment(true);
20289 // Don't do anything here, we will expand these intrinsics out later
20290 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
20291 return SDValue();
20293 case Intrinsic::x86_lwpins32:
20294 case Intrinsic::x86_lwpins64: {
20295 SDLoc dl(Op);
20296 SDValue Chain = Op->getOperand(0);
20297 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
20298 SDValue LwpIns =
20299 DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
20300 Op->getOperand(3), Op->getOperand(4));
20301 SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
20302 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
20303 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
20304 LwpIns.getValue(1));
20305 }
20306 }
20307 return SDValue();
20308 }
20310 SDLoc dl(Op);
20311 switch(IntrData->Type) {
20312 default: llvm_unreachable("Unknown Intrinsic Type");
20313 case RDSEED:
20314 case RDRAND: {
20315 // Emit the node with the right value type.
20316 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
20317 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20319 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
20320 // Otherwise return the value from Rand, which is always 0, casted to i32.
20321 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
20322 DAG.getConstant(1, dl, Op->getValueType(1)),
20323 DAG.getConstant(X86::COND_B, dl, MVT::i32),
20324 SDValue(Result.getNode(), 1) };
20325 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
20326 DAG.getVTList(Op->getValueType(1), MVT::Glue),
20327 Ops);
20329 // Return { result, isValid, chain }.
20330 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
20331 SDValue(Result.getNode(), 2));
20332 }
20333 case GATHER_AVX2: {
20334 SDValue Chain = Op.getOperand(0);
20335 SDValue Src = Op.getOperand(2);
20336 SDValue Base = Op.getOperand(3);
20337 SDValue Index = Op.getOperand(4);
20338 SDValue Mask = Op.getOperand(5);
20339 SDValue Scale = Op.getOperand(6);
20340 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20341 Scale, Chain, Subtarget);
20342 }
20343 case GATHER: {
20344 //gather(v1, mask, index, base, scale);
20345 SDValue Chain = Op.getOperand(0);
20346 SDValue Src = Op.getOperand(2);
20347 SDValue Base = Op.getOperand(3);
20348 SDValue Index = Op.getOperand(4);
20349 SDValue Mask = Op.getOperand(5);
20350 SDValue Scale = Op.getOperand(6);
20351 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
20352 Chain, Subtarget);
20353 }
20354 case SCATTER: {
20355 //scatter(base, mask, index, v1, scale);
20356 SDValue Chain = Op.getOperand(0);
20357 SDValue Base = Op.getOperand(2);
20358 SDValue Mask = Op.getOperand(3);
20359 SDValue Index = Op.getOperand(4);
20360 SDValue Src = Op.getOperand(5);
20361 SDValue Scale = Op.getOperand(6);
20362 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20363 Scale, Chain, Subtarget);
20364 }
20365 case PREFETCH: {
20366 SDValue Hint = Op.getOperand(6);
20367 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
20368 assert((HintVal == 2 || HintVal == 3) &&
20369 "Wrong prefetch hint in intrinsic: should be 2 or 3");
20370 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
20371 SDValue Chain = Op.getOperand(0);
20372 SDValue Mask = Op.getOperand(2);
20373 SDValue Index = Op.getOperand(3);
20374 SDValue Base = Op.getOperand(4);
20375 SDValue Scale = Op.getOperand(5);
20376 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
20377 Subtarget);
20378 }
20379 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
20380 case RDTSC: {
20381 SmallVector<SDValue, 2> Results;
20382 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
20383 Results);
20384 return DAG.getMergeValues(Results, dl);
20385 }
20386 // Read Performance Monitoring Counters.
20387 case RDPMC: {
20388 SmallVector<SDValue, 2> Results;
20389 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
20390 return DAG.getMergeValues(Results, dl);
20391 }
20392 // Get Extended Control Register.
20393 case XGETBV: {
20394 SmallVector<SDValue, 2> Results;
20395 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
20396 return DAG.getMergeValues(Results, dl);
20397 }
20398 // XTEST intrinsics.
20399 case XTEST: {
20400 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20401 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20403 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
20404 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
20405 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20406 Ret, SDValue(InTrans.getNode(), 1));
20407 }
20408 // ADC/ADCX/SBB
20409 case ADX: {
20410 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20411 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
20412 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
20413 DAG.getConstant(-1, dl, MVT::i8));
20414 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
20415 Op.getOperand(4), GenCF.getValue(1));
20416 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
20417 Op.getOperand(5), MachinePointerInfo());
20418 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
20419 SDValue Results[] = { SetCC, Store };
20420 return DAG.getMergeValues(Results, dl);
20421 }
20422 case COMPRESS_TO_MEM: {
20423 SDValue Mask = Op.getOperand(4);
20424 SDValue DataToCompress = Op.getOperand(3);
20425 SDValue Addr = Op.getOperand(2);
20426 SDValue Chain = Op.getOperand(0);
20427 MVT VT = DataToCompress.getSimpleValueType();
20429 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20430 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20432 if (isAllOnesConstant(Mask)) // return just a store
20433 return DAG.getStore(Chain, dl, DataToCompress, Addr,
20434 MemIntr->getMemOperand());
20436 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20437 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20439 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
20440 MemIntr->getMemOperand(),
20441 false /* truncating */, true /* compressing */);
20442 }
20443 case TRUNCATE_TO_MEM_VI8:
20444 case TRUNCATE_TO_MEM_VI16:
20445 case TRUNCATE_TO_MEM_VI32: {
20446 SDValue Mask = Op.getOperand(4);
20447 SDValue DataToTruncate = Op.getOperand(3);
20448 SDValue Addr = Op.getOperand(2);
20449 SDValue Chain = Op.getOperand(0);
20451 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20452 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20454 EVT MemVT = MemIntr->getMemoryVT();
20456 uint16_t TruncationOp = IntrData->Opc0;
20457 switch (TruncationOp) {
20458 case X86ISD::VTRUNC: {
20459 if (isAllOnesConstant(Mask)) // return just a truncate store
20460 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
20461 MemIntr->getMemOperand());
20463 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20464 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20466 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
20467 MemIntr->getMemOperand(), true /* truncating */);
20468 }
20469 case X86ISD::VTRUNCUS:
20470 case X86ISD::VTRUNCS: {
20471 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
20472 if (isAllOnesConstant(Mask))
20473 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
20474 MemIntr->getMemOperand(), DAG);
20476 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20477 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20479 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
20480 VMask, MemVT, MemIntr->getMemOperand(), DAG);
20481 }
20482 default:
20483 llvm_unreachable("Unsupported truncstore intrinsic");
20484 }
20485 }
20487 case EXPAND_FROM_MEM: {
20488 SDValue Mask = Op.getOperand(4);
20489 SDValue PassThru = Op.getOperand(3);
20490 SDValue Addr = Op.getOperand(2);
20491 SDValue Chain = Op.getOperand(0);
20492 MVT VT = Op.getSimpleValueType();
20494 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20495 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20497 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
20498 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
20499 if (X86::isZeroNode(Mask))
20500 return DAG.getUNDEF(VT);
20502 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20503 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20504 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
20505 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
20506 true /* expanding */);
20507 }
20508 }
20509 }
20511 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
20512 SelectionDAG &DAG) const {
20513 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20514 MFI.setReturnAddressIsTaken(true);
20516 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
20517 return SDValue();
20519 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20520 SDLoc dl(Op);
20521 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20523 if (Depth > 0) {
20524 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
20525 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20526 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
20527 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
20528 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
20529 MachinePointerInfo());
20530 }
20532 // Just load the return address.
20533 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
20534 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
20535 MachinePointerInfo());
20536 }
20538 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
20539 SelectionDAG &DAG) const {
20540 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
20541 return getReturnAddressFrameIndex(DAG);
20542 }
20544 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
20545 MachineFunction &MF = DAG.getMachineFunction();
20546 MachineFrameInfo &MFI = MF.getFrameInfo();
20547 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20548 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20549 EVT VT = Op.getValueType();
20551 MFI.setFrameAddressIsTaken(true);
20553 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
20554 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
20555 // is not possible to crawl up the stack without looking at the unwind codes
20556 // simultaneously.
20557 int FrameAddrIndex = FuncInfo->getFAIndex();
20558 if (!FrameAddrIndex) {
20559 // Set up a frame object for the return address.
20560 unsigned SlotSize = RegInfo->getSlotSize();
20561 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
20562 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
20563 FuncInfo->setFAIndex(FrameAddrIndex);
20564 }
20565 return DAG.getFrameIndex(FrameAddrIndex, VT);
20566 }
20568 unsigned FrameReg =
20569 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20570 SDLoc dl(Op); // FIXME probably not meaningful
20571 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20572 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
20573 (FrameReg == X86::EBP && VT == MVT::i32)) &&
20574 "Invalid Frame Register!");
20575 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
20576 while (Depth--)
20577 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
20578 MachinePointerInfo());
20579 return FrameAddr;
20580 }
20582 // FIXME? Maybe this could be a TableGen attribute on some registers and
20583 // this table could be generated automatically from RegInfo.
20584 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
20585 SelectionDAG &DAG) const {
20586 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20587 const MachineFunction &MF = DAG.getMachineFunction();
20589 unsigned Reg = StringSwitch<unsigned>(RegName)
20590 .Case("esp", X86::ESP)
20591 .Case("rsp", X86::RSP)
20592 .Case("ebp", X86::EBP)
20593 .Case("rbp", X86::RBP)
20596 if (Reg == X86::EBP || Reg == X86::RBP) {
20597 if (!TFI.hasFP(MF))
20598 report_fatal_error("register " + StringRef(RegName) +
20599 " is allocatable: function has no frame pointer");
20602 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20603 unsigned FrameReg =
20604 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20605 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
20606 "Invalid Frame Register!");
20614 report_fatal_error("Invalid register name global variable");
20617 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
20618 SelectionDAG &DAG) const {
20619 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20620 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
20621 }
20623 unsigned X86TargetLowering::getExceptionPointerRegister(
20624 const Constant *PersonalityFn) const {
20625 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
20626 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20628 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
20629 }
20631 unsigned X86TargetLowering::getExceptionSelectorRegister(
20632 const Constant *PersonalityFn) const {
20633 // Funclet personalities don't use selectors (the runtime does the selection).
20634 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
20635 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20636 }
20638 bool X86TargetLowering::needsFixedCatchObjects() const {
20639 return Subtarget.isTargetWin64();
20640 }
20642 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
20643 SDValue Chain = Op.getOperand(0);
20644 SDValue Offset = Op.getOperand(1);
20645 SDValue Handler = Op.getOperand(2);
20646 SDLoc dl(Op);
20648 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20649 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20650 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
20651 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20652 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20653 "Invalid Frame Register!");
20654 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
20655 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
20657 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
20658 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
20659 dl));
20660 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
20661 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
20662 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
20664 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
20665 DAG.getRegister(StoreAddrReg, PtrVT));
20666 }
20668 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20669 SelectionDAG &DAG) const {
20670 SDLoc DL(Op);
20671 // If the subtarget is not 64bit, we may need the global base reg
20672 // after isel expand pseudo, i.e., after CGBR pass ran.
20673 // Therefore, ask for the GlobalBaseReg now, so that the pass
20674 // inserts the code for us in case we need it.
20675 // Otherwise, we will end up in a situation where we will
20676 // reference a virtual register that is not defined!
20677 if (!Subtarget.is64Bit()) {
20678 const X86InstrInfo *TII = Subtarget.getInstrInfo();
20679 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20680 }
20681 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20682 DAG.getVTList(MVT::i32, MVT::Other),
20683 Op.getOperand(0), Op.getOperand(1));
20684 }
20686 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20687 SelectionDAG &DAG) const {
20688 SDLoc DL(Op);
20689 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20690 Op.getOperand(0), Op.getOperand(1));
20691 }
20693 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20694 SelectionDAG &DAG) const {
20695 SDLoc DL(Op);
20696 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
20697 Op.getOperand(0));
20698 }
20700 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20701 return Op.getOperand(0);
20702 }
20704 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20705 SelectionDAG &DAG) const {
20706 SDValue Root = Op.getOperand(0);
20707 SDValue Trmp = Op.getOperand(1); // trampoline
20708 SDValue FPtr = Op.getOperand(2); // nested function
20709 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20710 SDLoc dl(Op);
20712 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20713 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20715 if (Subtarget.is64Bit()) {
20716 SDValue OutChains[6];
20718 // Large code-model.
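// Rough layout of the 64-bit trampoline emitted by the stores below (offsets
// correspond to the MachinePointerInfo offsets used for each store):
//   0: REX.WB, 0xB8+r11     movabsq <FPtr>, %r11   (FPtr written at offset 2)
//  10: REX.WB, 0xB8+r10     movabsq <Nest>, %r10   (Nest written at offset 12)
//  20: REX.WB, 0xFF /4      jmpq *%r11             (ModRM byte at offset 22)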
20719 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
20720 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20722 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20723 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20725 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
20727 // Load the pointer to the nested function into R11.
20728 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20729 SDValue Addr = Trmp;
20730 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20731 Addr, MachinePointerInfo(TrmpAddr));
20733 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20734 DAG.getConstant(2, dl, MVT::i64));
20735 OutChains[1] =
20736 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
20737 /* Alignment = */ 2);
20739 // Load the 'nest' parameter value into R10.
20740 // R10 is specified in X86CallingConv.td
20741 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
20742 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20743 DAG.getConstant(10, dl, MVT::i64));
20744 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20745 Addr, MachinePointerInfo(TrmpAddr, 10));
20747 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20748 DAG.getConstant(12, dl, MVT::i64));
20749 OutChains[3] =
20750 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
20751 /* Alignment = */ 2);
20753 // Jump to the nested function.
20754 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
20755 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20756 DAG.getConstant(20, dl, MVT::i64));
20757 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20758 Addr, MachinePointerInfo(TrmpAddr, 20));
20760 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
20761 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20762 DAG.getConstant(22, dl, MVT::i64));
20763 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
20764 Addr, MachinePointerInfo(TrmpAddr, 22));
20766 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20767 } else {
20768 const Function *Func =
20769 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
20770 CallingConv::ID CC = Func->getCallingConv();
20771 unsigned NestReg;
20773 switch (CC) {
20774 default:
20775 llvm_unreachable("Unsupported calling convention");
20776 case CallingConv::C:
20777 case CallingConv::X86_StdCall: {
20778 // Pass 'nest' parameter in ECX.
20779 // Must be kept in sync with X86CallingConv.td
20780 NestReg = X86::ECX;
20782 // Check that ECX wasn't needed by an 'inreg' parameter.
20783 FunctionType *FTy = Func->getFunctionType();
20784 const AttributeList &Attrs = Func->getAttributes();
20786 if (!Attrs.isEmpty() && !Func->isVarArg()) {
20787 unsigned InRegCount = 0;
20788 unsigned Idx = 1;
20790 for (FunctionType::param_iterator I = FTy->param_begin(),
20791 E = FTy->param_end(); I != E; ++I, ++Idx)
20792 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
20793 auto &DL = DAG.getDataLayout();
20794 // FIXME: should only count parameters that are lowered to integers.
20795 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
20796 }
20798 if (InRegCount > 2) {
20799 report_fatal_error("Nest register in use - reduce number of inreg"
20800 " parameters!");
20801 }
20802 }
20803 break;
20804 }
20805 case CallingConv::X86_FastCall:
20806 case CallingConv::X86_ThisCall:
20807 case CallingConv::Fast:
20808 // Pass 'nest' parameter in EAX.
20809 // Must be kept in sync with X86CallingConv.td
20810 NestReg = X86::EAX;
20811 break;
20812 }
20814 SDValue OutChains[4];
20815 SDValue Addr, Disp;
20817 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20818 DAG.getConstant(10, dl, MVT::i32));
20819 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
20821 // This is storing the opcode for MOV32ri.
20822 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
20823 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
20824 OutChains[0] =
20825 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
20826 Trmp, MachinePointerInfo(TrmpAddr));
20828 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20829 DAG.getConstant(1, dl, MVT::i32));
20830 OutChains[1] =
20831 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
20832 /* Alignment = */ 1);
20834 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
20835 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20836 DAG.getConstant(5, dl, MVT::i32));
20837 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
20838 Addr, MachinePointerInfo(TrmpAddr, 5),
20839 /* Alignment = */ 1);
20841 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20842 DAG.getConstant(6, dl, MVT::i32));
20843 OutChains[3] =
20844 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
20845 /* Alignment = */ 1);
20847 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20848 }
20849 }
20851 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
20852 SelectionDAG &DAG) const {
20853 /*
20854 The rounding mode is in bits 11:10 of FPSR, and has the following
20855 settings:
20856 00 Round to nearest
20857 01 Round to -inf
20858 10 Round to +inf
20859 11 Round to 0
20861 FLT_ROUNDS, on the other hand, expects the following:
20862 -1 Undefined
20863 0 Round to 0
20864 1 Round to nearest
20865 2 Round to +inf
20866 3 Round to -inf
20868 To perform the conversion, we do:
20869 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
20870 */
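// For example, FPSR bits 11:10 == 10 (round to +inf) gives
// ((0x800 >> 11) | (0x000 >> 9)) + 1 = 2, and 2 & 3 = 2, which is FLT_ROUNDS'
// "round to +inf"; bits 11:10 == 11 (round to 0) gives (1 | 2) + 1 = 4 and
// 4 & 3 = 0, which is FLT_ROUNDS' "round to 0".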
20872 MachineFunction &MF = DAG.getMachineFunction();
20873 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20874 unsigned StackAlignment = TFI.getStackAlignment();
20875 MVT VT = Op.getSimpleValueType();
20876 SDLoc DL(Op);
20878 // Save FP Control Word to stack slot
20879 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
20880 SDValue StackSlot =
20881 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
20883 MachineMemOperand *MMO =
20884 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
20885 MachineMemOperand::MOStore, 2, 2);
20887 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
20888 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
20889 DAG.getVTList(MVT::Other),
20890 Ops, MVT::i16, MMO);
20892 // Load FP Control Word from stack slot
20893 SDValue CWD =
20894 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
20896 // Transform as necessary
20897 SDValue CWD1 =
20898 DAG.getNode(ISD::SRL, DL, MVT::i16,
20899 DAG.getNode(ISD::AND, DL, MVT::i16,
20900 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
20901 DAG.getConstant(11, DL, MVT::i8));
20902 SDValue CWD2 =
20903 DAG.getNode(ISD::SRL, DL, MVT::i16,
20904 DAG.getNode(ISD::AND, DL, MVT::i16,
20905 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
20906 DAG.getConstant(9, DL, MVT::i8));
20908 SDValue RetVal =
20909 DAG.getNode(ISD::AND, DL, MVT::i16,
20910 DAG.getNode(ISD::ADD, DL, MVT::i16,
20911 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
20912 DAG.getConstant(1, DL, MVT::i16)),
20913 DAG.getConstant(3, DL, MVT::i16));
20915 return DAG.getNode((VT.getSizeInBits() < 16 ?
20916 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
20917 }
20919 // Split a unary integer op into 2 half-sized ops.
20920 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
20921 MVT VT = Op.getSimpleValueType();
20922 unsigned NumElems = VT.getVectorNumElements();
20923 unsigned SizeInBits = VT.getSizeInBits();
20925 // Extract the Lo/Hi vectors
20926 SDLoc dl(Op);
20927 SDValue Src = Op.getOperand(0);
20928 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
20929 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
20931 MVT EltVT = VT.getVectorElementType();
20932 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
20933 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20934 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
20935 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
20936 }
20938 // Decompose 256-bit ops into smaller 128-bit ops.
20939 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
20940 assert(Op.getSimpleValueType().is256BitVector() &&
20941 Op.getSimpleValueType().isInteger() &&
20942 "Only handle AVX 256-bit vector integer operation");
20943 return LowerVectorIntUnary(Op, DAG);
20944 }
20946 // Decompose 512-bit ops into smaller 256-bit ops.
20947 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
20948 assert(Op.getSimpleValueType().is512BitVector() &&
20949 Op.getSimpleValueType().isInteger() &&
20950 "Only handle AVX 512-bit vector integer operation");
20951 return LowerVectorIntUnary(Op, DAG);
20952 }
20954 /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
20956 // i8/i16 vector implemented using dword LZCNT vector instruction
20957 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
20958 // split the vector, perform the operation on its Lo and Hi parts, and
20959 // concatenate the results.
20960 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
20961 assert(Op.getOpcode() == ISD::CTLZ);
20962 SDLoc dl(Op);
20963 MVT VT = Op.getSimpleValueType();
20964 MVT EltVT = VT.getVectorElementType();
20965 unsigned NumElems = VT.getVectorNumElements();
20967 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
20968 "Unsupported element type");
20970 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
20971 if (16 < NumElems)
20972 return LowerVectorIntUnary(Op, DAG);
20974 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
20975 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
20976 "Unsupported value type for operation");
20978 // Use native supported vector instruction vplzcntd.
20979 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
20980 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
20981 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
20982 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
20984 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
20985 }
20987 // Lower CTLZ using a PSHUFB lookup table implementation.
20988 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
20989 const X86Subtarget &Subtarget,
20990 SelectionDAG &DAG) {
20991 MVT VT = Op.getSimpleValueType();
20992 int NumElts = VT.getVectorNumElements();
20993 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
20994 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
20996 // Per-nibble leading zero PSHUFB lookup table.
20997 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
20998 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
20999 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21000 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
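// For example, ctlz of the byte 0x1A: the hi nibble is 0x1, so the result is
// LUT[0x1] = 3 and the lo-nibble count is masked away below; for 0x0A the hi
// nibble is zero, so the result is LUT[0x0] + LUT[0xA] = 4 + 0 = 4.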
21002 SmallVector<SDValue, 64> LUTVec;
21003 for (int i = 0; i < NumBytes; ++i)
21004 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21005 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21007 // Begin by bitcasting the input to byte vector, then split those bytes
21008 // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
21009 // If the hi input nibble is zero then we add both results together, otherwise
21010 // we just take the hi result (by masking the lo result to zero before the
21011 // add).
21012 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21013 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21015 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21016 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21017 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21018 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
21019 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21021 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21022 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21023 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21024 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21026 // Merge result back from vXi8 back to VT, working on the lo/hi halves
21027 // of the current vector width in the same way we did for the nibbles.
21028 // If the upper half of the input element is zero then add the halves'
21029 // leading zero counts together, otherwise just use the upper half's.
21030 // Double the width of the result until we are at target width.
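// For example, when combining vXi8 counts into vXi16 counts, the i16 element
// 0x00F0 yields 8 (its upper byte is all zero) + 0 (ctlz of the byte 0xF0) = 8.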
21031 while (CurrVT != VT) {
21032 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21033 int CurrNumElts = CurrVT.getVectorNumElements();
21034 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21035 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21036 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21038 // Check if the upper half of the input element is zero.
21039 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21040 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21041 HiZ = DAG.getBitcast(NextVT, HiZ);
21043 // Move the upper/lower halves to the lower bits as we'll be extending to
21044 // NextVT. Mask the lower result to zero if HiZ is true and add the results
21045 // together.
21046 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21047 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21048 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21049 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21050 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
21051 CurrVT = NextVT;
21052 }
21054 return Res;
21055 }
21057 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21058 const X86Subtarget &Subtarget,
21059 SelectionDAG &DAG) {
21060 MVT VT = Op.getSimpleValueType();
21062 if (Subtarget.hasCDI())
21063 return LowerVectorCTLZ_AVX512CDI(Op, DAG);
21065 // Decompose 256-bit ops into smaller 128-bit ops.
21066 if (VT.is256BitVector() && !Subtarget.hasInt256())
21067 return Lower256IntUnary(Op, DAG);
21069 // Decompose 512-bit ops into smaller 256-bit ops.
21070 if (VT.is512BitVector() && !Subtarget.hasBWI())
21071 return Lower512IntUnary(Op, DAG);
21073 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21074 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
21075 }
21077 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21078 SelectionDAG &DAG) {
21079 MVT VT = Op.getSimpleValueType();
21080 MVT OpVT = VT;
21081 unsigned NumBits = VT.getSizeInBits();
21082 SDLoc dl(Op);
21083 unsigned Opc = Op.getOpcode();
21085 if (VT.isVector())
21086 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
21088 Op = Op.getOperand(0);
21089 if (VT == MVT::i8) {
21090 // Zero extend to i32 since there is not an i8 bsr.
21091 OpVT = MVT::i32;
21092 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
21093 }
21095 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21096 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21097 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21099 if (Opc == ISD::CTLZ) {
21100 // If src is zero (i.e. bsr sets ZF), returns NumBits.
21101 SDValue Ops[] = {
21102 Op,
21103 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
21104 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21105 Op.getValue(1)
21106 };
21107 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
21108 }
21110 // Finally xor with NumBits-1.
21111 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21112 DAG.getConstant(NumBits - 1, dl, OpVT));
21114 if (VT == MVT::i8)
21115 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
21116 return Op;
21117 }
21119 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21120 MVT VT = Op.getSimpleValueType();
21121 unsigned NumBits = VT.getScalarSizeInBits();
21122 SDLoc dl(Op);
21124 if (VT.isVector()) {
21125 SDValue N0 = Op.getOperand(0);
21126 SDValue Zero = DAG.getConstant(0, dl, VT);
21128 // lsb(x) = (x & -x)
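// For example, x = 0b0110100 gives lsb = x & -x = 0b0000100, so
// cttz(x) = ctpop(lsb - 1) = ctpop(0b0000011) = 2.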
21129 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21130 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21132 // cttz_undef(x) = (width - 1) - ctlz(lsb)
21133 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21134 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21135 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
21136 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
21137 }
21139 // cttz(x) = ctpop(lsb - 1)
21140 SDValue One = DAG.getConstant(1, dl, VT);
21141 return DAG.getNode(ISD::CTPOP, dl, VT,
21142 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
21143 }
21145 assert(Op.getOpcode() == ISD::CTTZ &&
21146 "Only scalar CTTZ requires custom lowering");
21148 // Issue a bsf (scan bits forward) which also sets EFLAGS.
21149 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21150 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
21152 // If src is zero (i.e. bsf sets ZF), returns NumBits.
21153 SDValue Ops[] = {
21154 Op,
21155 DAG.getConstant(NumBits, dl, VT),
21156 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21157 Op.getValue(1)
21158 };
21159 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
21160 }
21162 /// Break a 256-bit integer operation into two new 128-bit ones and then
21163 /// concatenate the result back.
21164 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21165 MVT VT = Op.getSimpleValueType();
21167 assert(VT.is256BitVector() && VT.isInteger() &&
21168 "Unsupported value type for operation");
21170 unsigned NumElems = VT.getVectorNumElements();
21172 SDLoc dl(Op);
21173 // Extract the LHS vectors
21174 SDValue LHS = Op.getOperand(0);
21175 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21176 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21178 // Extract the RHS vectors
21179 SDValue RHS = Op.getOperand(1);
21180 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21181 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21183 MVT EltVT = VT.getVectorElementType();
21184 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21186 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21187 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21188 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21189 }
21191 /// Break a 512-bit integer operation into two new 256-bit ones and then
21192 /// concatenate the result back.
21193 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21194 MVT VT = Op.getSimpleValueType();
21196 assert(VT.is512BitVector() && VT.isInteger() &&
21197 "Unsupported value type for operation");
21199 unsigned NumElems = VT.getVectorNumElements();
21201 SDLoc dl(Op);
21202 // Extract the LHS vectors
21203 SDValue LHS = Op.getOperand(0);
21204 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21205 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21207 // Extract the RHS vectors
21208 SDValue RHS = Op.getOperand(1);
21209 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21210 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21212 MVT EltVT = VT.getVectorElementType();
21213 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21215 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21216 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21217 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21218 }
21220 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21221 MVT VT = Op.getSimpleValueType();
21222 if (VT.getScalarType() == MVT::i1)
21223 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21224 Op.getOperand(0), Op.getOperand(1));
21225 assert(Op.getSimpleValueType().is256BitVector() &&
21226 Op.getSimpleValueType().isInteger() &&
21227 "Only handle AVX 256-bit vector integer operation");
21228 return Lower256IntArith(Op, DAG);
21229 }
21231 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21232 assert(Op.getSimpleValueType().is256BitVector() &&
21233 Op.getSimpleValueType().isInteger() &&
21234 "Only handle AVX 256-bit vector integer operation");
21235 return Lower256IntUnary(Op, DAG);
21236 }
21238 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21239 assert(Op.getSimpleValueType().is256BitVector() &&
21240 Op.getSimpleValueType().isInteger() &&
21241 "Only handle AVX 256-bit vector integer operation");
21242 return Lower256IntArith(Op, DAG);
21243 }
21245 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21246 SelectionDAG &DAG) {
21247 SDLoc dl(Op);
21248 MVT VT = Op.getSimpleValueType();
21250 if (VT.getScalarType() == MVT::i1)
21251 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21253 // Decompose 256-bit ops into smaller 128-bit ops.
21254 if (VT.is256BitVector() && !Subtarget.hasInt256())
21255 return Lower256IntArith(Op, DAG);
21257 SDValue A = Op.getOperand(0);
21258 SDValue B = Op.getOperand(1);
21260 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21261 // vector pairs, multiply and truncate.
21262 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21263 if (Subtarget.hasInt256()) {
21264 // For 512-bit vectors, split into 256-bit vectors to allow the
21265 // sign-extension to occur.
21266 if (VT == MVT::v64i8)
21267 return Lower512IntArith(Op, DAG);
21269 // For 256-bit vectors, split into 128-bit vectors to allow the
21270 // sign-extension to occur. We don't need this on AVX512BW as we can
21271 // safely sign-extend to v32i16.
21272 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21273 return Lower256IntArith(Op, DAG);
21275 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21276 return DAG.getNode(
21277 ISD::TRUNCATE, dl, VT,
21278 DAG.getNode(ISD::MUL, dl, ExVT,
21279 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
21280 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
21281 }
21283 assert(VT == MVT::v16i8 &&
21284 "Pre-AVX2 support only supports v16i8 multiplication");
21285 MVT ExVT = MVT::v8i16;
21287 // Extract the lo parts and sign extend to i16
21288 SDValue ALo, BLo;
21289 if (Subtarget.hasSSE41()) {
21290 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
21291 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
21292 } else {
21293 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21294 -1, 4, -1, 5, -1, 6, -1, 7};
21295 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21296 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21297 ALo = DAG.getBitcast(ExVT, ALo);
21298 BLo = DAG.getBitcast(ExVT, BLo);
21299 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21300 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21301 }
21303 // Extract the hi parts and sign extend to i16
21304 SDValue AHi, BHi;
21305 if (Subtarget.hasSSE41()) {
21306 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21307 -1, -1, -1, -1, -1, -1, -1, -1};
21308 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21309 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21310 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
21311 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
21312 } else {
21313 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21314 -1, 12, -1, 13, -1, 14, -1, 15};
21315 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21316 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21317 AHi = DAG.getBitcast(ExVT, AHi);
21318 BHi = DAG.getBitcast(ExVT, BHi);
21319 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21320 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21321 }
21323 // Multiply, mask the lower 8bits of the lo/hi results and pack
21324 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21325 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21326 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
21327 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
21328 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21329 }
21331 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
21332 if (VT == MVT::v4i32) {
21333 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21334 "Should not custom lower when pmuldq is available!");
21336 // Extract the odd parts.
21337 static const int UnpackMask[] = { 1, -1, 3, -1 };
21338 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
21339 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
21341 // Multiply the even parts.
21342 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
21343 // Now multiply odd parts.
21344 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
21346 Evens = DAG.getBitcast(VT, Evens);
21347 Odds = DAG.getBitcast(VT, Odds);
21349 // Merge the two vectors back together with a shuffle. This expands into 2
21350 // instructions.
21351 static const int ShufMask[] = { 0, 4, 2, 6 };
21352 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
21353 }
21355 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21356 "Only know how to lower V2I64/V4I64/V8I64 multiply");
21358 // 32-bit vector types used for MULDQ/MULUDQ.
21359 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
21361 // MULDQ returns the 64-bit result of the signed multiplication of the lower
21362 // 32-bits. We can lower with this if the sign bits stretch that far.
21363 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
21364 DAG.ComputeNumSignBits(B) > 32) {
21365 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
21366 DAG.getBitcast(MulVT, B));
21367 }
21369 // Ahi = psrlqi(a, 32);
21370 // Bhi = psrlqi(b, 32);
21372 // AloBlo = pmuludq(a, b);
21373 // AloBhi = pmuludq(a, Bhi);
21374 // AhiBlo = pmuludq(Ahi, b);
21376 // Hi = psllqi(AloBhi + AhiBlo, 32);
21377 // return AloBlo + Hi;
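// This follows from a = Ahi*2^32 + Alo and b = Bhi*2^32 + Blo: modulo 2^64,
// a*b = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32); the Ahi*Bhi term overflows
// out of the 64-bit result entirely.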
21378 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
21379 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
21380 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
21382 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
21383 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
21384 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
21386 // Bit cast to 32-bit vectors for MULUDQ.
21387 SDValue Alo = DAG.getBitcast(MulVT, A);
21388 SDValue Blo = DAG.getBitcast(MulVT, B);
21390 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21392 // Only multiply lo/hi halves that aren't known to be zero.
21393 SDValue AloBlo = Zero;
21394 if (!ALoIsZero && !BLoIsZero)
21395 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
21397 SDValue AloBhi = Zero;
21398 if (!ALoIsZero && !BHiIsZero) {
21399 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
21400 Bhi = DAG.getBitcast(MulVT, Bhi);
21401 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
21402 }
21404 SDValue AhiBlo = Zero;
21405 if (!AHiIsZero && !BLoIsZero) {
21406 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
21407 Ahi = DAG.getBitcast(MulVT, Ahi);
21408 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
21409 }
21411 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
21412 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
21414 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
21415 }
21417 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
21418 SelectionDAG &DAG) {
21419 SDLoc dl(Op);
21420 MVT VT = Op.getSimpleValueType();
21422 // Decompose 256-bit ops into smaller 128-bit ops.
21423 if (VT.is256BitVector() && !Subtarget.hasInt256())
21424 return Lower256IntArith(Op, DAG);
21426 // Only i8 vectors should need custom lowering after this.
21427 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21428 "Unsupported vector type");
21430 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21431 // logical shift down the upper half and pack back to i8.
21432 SDValue A = Op.getOperand(0);
21433 SDValue B = Op.getOperand(1);
21435 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
21436 // and then ashr/lshr the upper bits down to the lower bits before multiply.
21437 unsigned Opcode = Op.getOpcode();
21438 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
21439 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
21441 // AVX2 implementations - extend xmm subvectors to ymm.
21442 if (Subtarget.hasInt256()) {
21443 SDValue Lo = DAG.getIntPtrConstant(0, dl);
21444 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
21446 if (VT == MVT::v32i8) {
21447 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
21448 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
21449 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
21450 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
21451 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
21452 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
21453 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
21454 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
21455 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21456 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
21457 DAG.getConstant(8, dl, MVT::v16i16));
21458 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21459 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
21460 DAG.getConstant(8, dl, MVT::v16i16));
21461 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
21462 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
21463 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
21464 16, 17, 18, 19, 20, 21, 22, 23};
21465 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21466 24, 25, 26, 27, 28, 29, 30, 31};
21467 return DAG.getNode(X86ISD::PACKUS, dl, VT,
21468 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
21469 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
21472 SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
21473 SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
21474 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
21475 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
21476 DAG.getConstant(8, dl, MVT::v16i16));
21477 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
21478 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
21479 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21482 assert(VT == MVT::v16i8 &&
21483 "Pre-AVX2 support only supports v16i8 multiplication");
21484 MVT ExVT = MVT::v8i16;
21486 // Extract the lo parts and zero/sign extend to i16.
21488 if (Subtarget.hasSSE41()) {
21489 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
21490 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
21492 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21493 -1, 4, -1, 5, -1, 6, -1, 7};
21494 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21495 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21496 ALo = DAG.getBitcast(ExVT, ALo);
21497 BLo = DAG.getBitcast(ExVT, BLo);
21498 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21499 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21502 // Extract the hi parts and zero/sign extend to i16.
21504 if (Subtarget.hasSSE41()) {
21505 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21506 -1, -1, -1, -1, -1, -1, -1, -1};
21507 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21508 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21509 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
21510 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
21512 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21513 -1, 12, -1, 13, -1, 14, -1, 15};
21514 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21515 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21516 AHi = DAG.getBitcast(ExVT, AHi);
21517 BHi = DAG.getBitcast(ExVT, BHi);
21518 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21519 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21522 // Multiply, lshr the upper 8 bits down to the lower 8 bits of the lo/hi
21523 // results and pack back to v16i8.
21524 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21525 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21526 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
21527 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
21528 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21531 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
21532 assert(Subtarget.isTargetWin64() && "Unexpected target");
21533 EVT VT = Op.getValueType();
21534 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21535 "Unexpected return type for lowering");
21539 switch (Op->getOpcode()) {
21540 default: llvm_unreachable("Unexpected request for libcall!");
21541 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
21542 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
21543 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
21544 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
21545 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
21546 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
21550 SDValue InChain = DAG.getEntryNode();
21552 TargetLowering::ArgListTy Args;
21553 TargetLowering::ArgListEntry Entry;
21554 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
21555 EVT ArgVT = Op->getOperand(i).getValueType();
21556 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21557 "Unexpected argument type for lowering");
21558 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
21559 Entry.Node = StackPtr;
21560 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
21561 MachinePointerInfo(), /* Alignment = */ 16);
21562 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21563 Entry.Ty = PointerType::get(ArgTy,0);
21564 Entry.IsSExt = false;
21565 Entry.IsZExt = false;
21566 Args.push_back(Entry);
21569 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
21570 getPointerTy(DAG.getDataLayout()));
21572 TargetLowering::CallLoweringInfo CLI(DAG);
21573 CLI.setDebugLoc(dl)
21576 getLibcallCallingConv(LC),
21577 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
21580 .setSExtResult(isSigned)
21581 .setZExtResult(!isSigned);
21583 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21584 return DAG.getBitcast(VT, CallInfo.first);
21587 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21588 SelectionDAG &DAG) {
21589 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21590 MVT VT = Op0.getSimpleValueType();
21593 // Decompose 256-bit ops into smaller 128-bit ops.
21594 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21595 unsigned Opcode = Op.getOpcode();
21596 unsigned NumElems = VT.getVectorNumElements();
21597 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21598 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21599 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21600 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21601 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
21602 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21603 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
21605 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21606 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
21608 return DAG.getMergeValues(Ops, dl);
21611 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21612 (VT == MVT::v8i32 && Subtarget.hasInt256()));
21614 // PMULxD operations multiply each even value (starting at 0) of LHS with
21615 // the related value of RHS and produce a widened result.
21616 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21617 // => <2 x i64> <ae|cg>
21619 // In other words, to have all the results, we need to perform two PMULxD:
21620 // 1. one with the even values.
21621 // 2. one with the odd values.
21622 // To achieve #2, we need to place the odd values at an even position.
21624 // Place the odd value at an even position (basically, shift all values 1
21625 // step to the left):
21626 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
21627 // <a|b|c|d> => <b|undef|d|undef>
21628 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21629 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21630 // <e|f|g|h> => <f|undef|h|undef>
21631 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21632 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21634 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
21635 // ints.
21636 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
21637 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
21639 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21640 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21641 // => <2 x i64> <ae|cg>
21642 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21643 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21644 // => <2 x i64> <bf|dh>
21645 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21647 // Shuffle it back into the right order.
21648 SDValue Highs, Lows;
21649 if (VT == MVT::v8i32) {
21650 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
21651 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21652 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
21653 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21655 const int HighMask[] = {1, 5, 3, 7};
21656 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21657 const int LowMask[] = {0, 4, 2, 6};
21658 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21661 // If we have a signed multiply but no PMULDQ, fix up the high parts of an
21662 // unsigned multiply.
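// This is the standard identity mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0)
// - (b < 0 ? a : 0), taken modulo 2^32 per lane. T1 and T2 below compute the
// two correction terms by AND'ing each operand with the sign mask (all-ones
// or all-zeros) of the other operand.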
21663 if (IsSigned && !Subtarget.hasSSE41()) {
21664 SDValue ShAmt = DAG.getConstant(
21666 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21667 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21668 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21669 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21670 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21672 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21673 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21676 // The first result of MUL_LOHI is actually the low value, followed by the
21677 // high value.
21678 SDValue Ops[] = {Lows, Highs};
21679 return DAG.getMergeValues(Ops, dl);
21682 // Return true if the required (according to Opcode) shift-imm form is natively
21683 // supported by the Subtarget
21684 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
21686 if (VT.getScalarSizeInBits() < 16)
21689 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
21690 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
21693 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
21694 (VT.is256BitVector() && Subtarget.hasInt256());
21696 bool AShift = LShift && (Subtarget.hasAVX512() ||
21697 (VT != MVT::v2i64 && VT != MVT::v4i64));
21698 return (Opcode == ISD::SRA) ? AShift : LShift;
21701 // The shift amount is a variable, but it is the same for all vector lanes.
21702 // These instructions are defined together with shift-immediate.
21704 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21706 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21709 // Return true if the required (according to Opcode) variable-shift form is
21710 // natively supported by the Subtarget
21711 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21714 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21717 // vXi16 supported only on AVX-512, BWI
21718 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21721 if (Subtarget.hasAVX512())
21724 bool LShift = VT.is128BitVector() || VT.is256BitVector();
21725 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21726 return (Opcode == ISD::SRA) ? AShift : LShift;
21729 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21730 const X86Subtarget &Subtarget) {
21731 MVT VT = Op.getSimpleValueType();
21733 SDValue R = Op.getOperand(0);
21734 SDValue Amt = Op.getOperand(1);
21736 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21737 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21739 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
21740 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21741 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
21742 SDValue Ex = DAG.getBitcast(ExVT, R);
21744 if (ShiftAmt >= 32) {
21745 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
21747 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
21748 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21749 ShiftAmt - 32, DAG);
21750 if (VT == MVT::v2i64)
21751 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
21752 if (VT == MVT::v4i64)
21753 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21754 {9, 1, 11, 3, 13, 5, 15, 7});
21756 // SRA upper i32, SHL whole i64 and select lower i32.
21757 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21760 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
21761 Lower = DAG.getBitcast(ExVT, Lower);
21762 if (VT == MVT::v2i64)
21763 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
21764 if (VT == MVT::v4i64)
21765 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21766 {8, 1, 10, 3, 12, 5, 14, 7});
21768 return DAG.getBitcast(VT, Ex);
21771 // Optimize shl/srl/sra with constant shift amount.
21772 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21773 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
21774 uint64_t ShiftAmt = ShiftConst->getZExtValue();
21776 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21777 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21779 // i64 SRA needs to be performed as partial shifts.
21780 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
21781 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
21782 return ArithmeticShiftRight64(ShiftAmt);
21784 if (VT == MVT::v16i8 ||
21785 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
21786 VT == MVT::v64i8) {
21787 unsigned NumElts = VT.getVectorNumElements();
21788 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
21790 // Simple i8 add case
21791 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
21792 return DAG.getNode(ISD::ADD, dl, VT, R, R);
21794 // ashr(R, 7) === cmp_slt(R, 0)
21795 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
21796 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21797 if (VT.is512BitVector()) {
21798 assert(VT == MVT::v64i8 && "Unexpected element type!");
21799 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
21800 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
21802 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
21805 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
21806 if (VT == MVT::v16i8 && Subtarget.hasXOP())
21809 if (Op.getOpcode() == ISD::SHL) {
21810 // Make a large shift.
21811 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
21813 SHL = DAG.getBitcast(VT, SHL);
21814 // Zero out the rightmost bits.
21815 return DAG.getNode(ISD::AND, dl, VT, SHL,
21816 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
21818 if (Op.getOpcode() == ISD::SRL) {
21819 // Make a large shift.
21820 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
21822 SRL = DAG.getBitcast(VT, SRL);
21823 // Zero out the leftmost bits.
21824 return DAG.getNode(ISD::AND, dl, VT, SRL,
21825 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
21827 if (Op.getOpcode() == ISD::SRA) {
21828 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
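// where Mask = (1 << (7 - Amt)) marks the position the sign bit lands in
// after the logical shift. E.g. for R = 0xF0 (-16) and Amt = 4: lshr gives
// 0x0F, Mask = 0x08, xor gives 0x07, and 0x07 - 0x08 = 0xFF = -1, which
// matches ashr(-16, 4).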
21829 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21831 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
21832 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
21833 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
21836 llvm_unreachable("Unknown shift opcode.");
21841 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21842 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
21843 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
21844 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
21846 // Peek through any splat that was introduced for i64 shift vectorization.
21847 int SplatIndex = -1;
21848 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
21849 if (SVN->isSplat()) {
21850 SplatIndex = SVN->getSplatIndex();
21851 Amt = Amt.getOperand(0);
21852 assert(SplatIndex < (int)VT.getVectorNumElements() &&
21853 "Splat shuffle referencing second operand");
21856 if (Amt.getOpcode() != ISD::BITCAST ||
21857 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
21860 Amt = Amt.getOperand(0);
21861 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21862 VT.getVectorNumElements();
21863 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
21864 uint64_t ShiftAmt = 0;
21865 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
21866 for (unsigned i = 0; i != Ratio; ++i) {
21867 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
21871 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
21874 // Check remaining shift amounts (if not a splat).
21875 if (SplatIndex < 0) {
21876 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21877 uint64_t ShAmt = 0;
21878 for (unsigned j = 0; j != Ratio; ++j) {
21879 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
21883 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
21885 if (ShAmt != ShiftAmt)
21890 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21891 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21893 if (Op.getOpcode() == ISD::SRA)
21894 return ArithmeticShiftRight64(ShiftAmt);
21900 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
21901 const X86Subtarget &Subtarget) {
21902 MVT VT = Op.getSimpleValueType();
21904 SDValue R = Op.getOperand(0);
21905 SDValue Amt = Op.getOperand(1);
21907 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21908 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21910 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
21911 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
21913 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
21915 MVT EltVT = VT.getVectorElementType();
21917 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
21918 // Check if this build_vector node is doing a splat.
21919 // If so, then set BaseShAmt equal to the splat value.
21920 BaseShAmt = BV->getSplatValue();
21921 if (BaseShAmt && BaseShAmt.isUndef())
21922 BaseShAmt = SDValue();
21924 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
21925 Amt = Amt.getOperand(0);
21927 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
21928 if (SVN && SVN->isSplat()) {
21929 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
21930 SDValue InVec = Amt.getOperand(0);
21931 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
21932 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
21933 "Unexpected shuffle index found!");
21934 BaseShAmt = InVec.getOperand(SplatIdx);
21935 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
21936 if (ConstantSDNode *C =
21937 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
21938 if (C->getZExtValue() == SplatIdx)
21939 BaseShAmt = InVec.getOperand(1);
21944 // Avoid introducing an extract element from a shuffle.
21945 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
21946 DAG.getIntPtrConstant(SplatIdx, dl));
21950 if (BaseShAmt.getNode()) {
21951 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
21952 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
21953 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
21954 else if (EltVT.bitsLT(MVT::i32))
21955 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
21957 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
21961 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21962 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
21963 Amt.getOpcode() == ISD::BITCAST &&
21964 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
21965 Amt = Amt.getOperand(0);
21966 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21967 VT.getVectorNumElements();
21968 std::vector<SDValue> Vals(Ratio);
21969 for (unsigned i = 0; i != Ratio; ++i)
21970 Vals[i] = Amt.getOperand(i);
21971 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21972 for (unsigned j = 0; j != Ratio; ++j)
21973 if (Vals[j] != Amt.getOperand(i + j))
21977 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
21978 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
21983 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
21984 SelectionDAG &DAG) {
21985 MVT VT = Op.getSimpleValueType();
21987 SDValue R = Op.getOperand(0);
21988 SDValue Amt = Op.getOperand(1);
21989 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
21991 assert(VT.isVector() && "Custom lowering only for vector shifts!");
21992 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
21994 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
21997 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22000 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22003 // XOP has 128-bit variable logical/arithmetic shifts.
22004 // +ve/-ve Amt = shift left/right.
22005 if (Subtarget.hasXOP() &&
22006 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22007 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22008 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22009 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22010 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22012 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22013 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22014 if (Op.getOpcode() == ISD::SRA)
22015 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22018 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
22019 // shifts per-lane and then shuffle the partial results back together.
22020 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22021 // Splat the shift amounts so the scalar shifts above will catch it.
22022 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22023 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22024 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22025 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22026 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22029 // i64 vector arithmetic shift can be emulated with the transform:
22030 // M = lshr(SIGN_MASK, Amt)
22031 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
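// Here M = lshr(SIGN_MASK, Amt) marks, per lane, the bit position that holds
// the original sign bit after the logical shift; the xor/sub pair then
// sign-extends from that position, the same trick used for the v16i8
// constant-shift case above.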
22032 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22033 Op.getOpcode() == ISD::SRA) {
22034 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22035 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22036 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22037 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22038 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
22042 // If possible, lower this packed shift into a vector multiply instead of
22043 // expanding it into a sequence of scalar shifts.
22044 // Do this only if the vector shift count is a constant build_vector.
22045 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22046 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22047 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22048 SmallVector<SDValue, 8> Elts;
22049 MVT SVT = VT.getVectorElementType();
22050 unsigned SVTBits = SVT.getSizeInBits();
22051 APInt One(SVTBits, 1);
22052 unsigned NumElems = VT.getVectorNumElements();
22054 for (unsigned i=0; i !=NumElems; ++i) {
22055 SDValue Op = Amt->getOperand(i);
22056 if (Op->isUndef()) {
22057 Elts.push_back(Op);
22061 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22062 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22063 uint64_t ShAmt = C.getZExtValue();
22064 if (ShAmt >= SVTBits) {
22065 Elts.push_back(DAG.getUNDEF(SVT));
22068 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
22070 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22071 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22074 // Lower SHL with variable shift amount.
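// The trick below builds 2^Amt as a float: (Amt << 23) places the per-lane
// shift amount in the IEEE-754 exponent field, adding 0x3f800000 (the bit
// pattern of 1.0f) supplies the exponent bias, and converting the result
// back to integer yields 1 << Amt, so the variable shift becomes a single
// vector multiply.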
22075 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
22076 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22078 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22079 DAG.getConstant(0x3f800000U, dl, VT));
22080 Op = DAG.getBitcast(MVT::v4f32, Op);
22081 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22082 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22085 // If possible, lower this shift as a sequence of two shifts by
22086 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22088 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22090 // Could be rewritten as:
22091 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22093 // The advantage is that the two shifts from the example would be
22094 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22095 // the vector shift into four scalar shifts plus four pairs of vector
22096 // insert/extract.
22097 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22098 unsigned TargetOpcode = X86ISD::MOVSS;
22099 bool CanBeSimplified;
22100 // The splat value for the first packed shift (the 'X' from the example).
22101 SDValue Amt1 = Amt->getOperand(0);
22102 // The splat value for the second packed shift (the 'Y' from the example).
22103 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22105 // See if it is possible to replace this node with a sequence of
22106 // two shifts followed by a MOVSS/MOVSD/PBLEND.
22107 if (VT == MVT::v4i32) {
22108 // Check if it is legal to use a MOVSS.
22109 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22110 Amt2 == Amt->getOperand(3);
22111 if (!CanBeSimplified) {
22112 // Otherwise, check if we can still simplify this node using a MOVSD.
22113 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22114 Amt->getOperand(2) == Amt->getOperand(3);
22115 TargetOpcode = X86ISD::MOVSD;
22116 Amt2 = Amt->getOperand(2);
22119 // Do similar checks for the case where the machine value type
22120 // is MVT::v8i16.
22121 CanBeSimplified = Amt1 == Amt->getOperand(1);
22122 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22123 CanBeSimplified = Amt2 == Amt->getOperand(i);
22125 if (!CanBeSimplified) {
22126 TargetOpcode = X86ISD::MOVSD;
22127 CanBeSimplified = true;
22128 Amt2 = Amt->getOperand(4);
22129 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22130 CanBeSimplified = Amt1 == Amt->getOperand(i);
22131 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22132 CanBeSimplified = Amt2 == Amt->getOperand(j);
22136 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22137 isa<ConstantSDNode>(Amt2)) {
22138 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
22139 MVT CastVT = MVT::v4i32;
22141 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22142 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
22144 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22145 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22146 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
22147 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
22148 if (TargetOpcode == X86ISD::MOVSD)
22149 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22150 BitCast2, {0, 1, 6, 7}));
22151 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22152 BitCast2, {0, 5, 6, 7}));
22156 // v4i32 Non Uniform Shifts.
22157 // If the shift amount is constant we can shift each lane using the SSE2
22158 // immediate shifts, else we need to zero-extend each lane to the lower i64
22159 // and shift using the SSE2 variable shifts.
22160 // The separate results can then be blended together.
22161 if (VT == MVT::v4i32) {
22162 unsigned Opc = Op.getOpcode();
22163 SDValue Amt0, Amt1, Amt2, Amt3;
22164 if (ConstantAmt) {
22165 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22166 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22167 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
22168 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
22169 } else {
22170 // ISD::SHL is handled above but we include it here for completeness.
22171 switch (Opc) {
22172 default:
22173 llvm_unreachable("Unknown target vector shift node");
22174 case ISD::SHL:
22175 Opc = X86ISD::VSHL;
22176 break;
22177 case ISD::SRL:
22178 Opc = X86ISD::VSRL;
22179 break;
22180 case ISD::SRA:
22181 Opc = X86ISD::VSRA;
22182 break;
22183 }
22184 // The SSE2 shifts use the lower i64 as the same shift amount for
22185 // all lanes and the upper i64 is ignored. These shuffle masks
22186 // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
22187 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22188 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22189 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22190 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
22191 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
22192 }
22194 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22195 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22196 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22197 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
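// R0..R3 each hold the desired result only in lane 0..3 respectively; the
// shuffles below gather lanes {0,2} into R02 and lanes {1,3} into R13, then
// interleave them back into the original lane order.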
22198 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22199 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
22200 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
22203 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22204 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22205 // make the existing SSE solution better.
22206 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22207 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22208 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22209 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22210 MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22211 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
22213 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22214 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22215 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22216 return DAG.getNode(ISD::TRUNCATE, dl, VT,
22217 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
22220 if (VT == MVT::v16i8 ||
22221 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
22222 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
22223 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
22224 unsigned ShiftOpcode = Op->getOpcode();
22226 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
22227 if (VT.is512BitVector()) {
22228 // On AVX512BW targets we make use of the fact that VSELECT lowers
22229 // to a masked blend which selects bytes based just on the sign bit
22230 // extracted to a mask.
22231 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22232 V0 = DAG.getBitcast(VT, V0);
22233 V1 = DAG.getBitcast(VT, V1);
22234 Sel = DAG.getBitcast(VT, Sel);
22235 Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
22236 return DAG.getBitcast(SelVT,
22237 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
22238 } else if (Subtarget.hasSSE41()) {
22239 // On SSE41 targets we make use of the fact that VSELECT lowers
22240 // to PBLENDVB which selects bytes based just on the sign bit.
22241 V0 = DAG.getBitcast(VT, V0);
22242 V1 = DAG.getBitcast(VT, V1);
22243 Sel = DAG.getBitcast(VT, Sel);
22244 return DAG.getBitcast(SelVT,
22245 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
22247 // On pre-SSE41 targets we test for the sign bit by comparing to
22248 // zero - a negative value will set all bits of the lanes to true
22249 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
22250 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
22251 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
22252 return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
22255 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
22256 // We can safely do this using i16 shifts as we're only interested in
22257 // the 3 lower bits of each byte.
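// After this shift, bit 2 of each byte's shift amount sits in that byte's
// sign bit, so each SignBitSelect below conditionally applies a shift by 4;
// Amt is then doubled to expose the next bit, and the process repeats for
// shifts by 2 and by 1.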
22258 Amt = DAG.getBitcast(ExtVT, Amt);
22259 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
22260 Amt = DAG.getBitcast(VT, Amt);
22262 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
22263 // r = VSELECT(r, shift(r, 4), a);
22265 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22266 R = SignBitSelect(VT, Amt, M, R);
22269 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22271 // r = VSELECT(r, shift(r, 2), a);
22272 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22273 R = SignBitSelect(VT, Amt, M, R);
22276 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22278 // return VSELECT(r, shift(r, 1), a);
22279 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22280 R = SignBitSelect(VT, Amt, M, R);
22284 if (Op->getOpcode() == ISD::SRA) {
22285 // For SRA we need to unpack each byte to the higher byte of an i16 vector
22286 // so we can correctly sign extend. We don't care what happens to the
22287 // lower byte.
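// Placing each byte in the high half of an i16 lane means the i16 arithmetic
// shift sign-extends from the byte's own sign bit; whatever ends up in the
// low half is discarded by the final logical shift right by 8 before packing.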
22288 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
22289 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
22290 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
22291 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
22292 ALo = DAG.getBitcast(ExtVT, ALo);
22293 AHi = DAG.getBitcast(ExtVT, AHi);
22294 RLo = DAG.getBitcast(ExtVT, RLo);
22295 RHi = DAG.getBitcast(ExtVT, RHi);
22297 // r = VSELECT(r, shift(r, 4), a);
22298 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22299 DAG.getConstant(4, dl, ExtVT));
22300 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22301 DAG.getConstant(4, dl, ExtVT));
22302 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22303 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22306 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22307 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22309 // r = VSELECT(r, shift(r, 2), a);
22310 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22311 DAG.getConstant(2, dl, ExtVT));
22312 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22313 DAG.getConstant(2, dl, ExtVT));
22314 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22315 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22318 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22319 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22321 // r = VSELECT(r, shift(r, 1), a);
22322 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22323 DAG.getConstant(1, dl, ExtVT));
22324 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22325 DAG.getConstant(1, dl, ExtVT));
22326 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22327 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22329 // Logical shift the result back to the lower byte, leaving a zero upper
22330 // byte, meaning that we can safely pack with PACKUSWB.
22333 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
22335 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
22336 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22340 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
22341 MVT ExtVT = MVT::v8i32;
22342 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22343 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
22344 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
22345 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
22346 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
22347 ALo = DAG.getBitcast(ExtVT, ALo);
22348 AHi = DAG.getBitcast(ExtVT, AHi);
22349 RLo = DAG.getBitcast(ExtVT, RLo);
22350 RHi = DAG.getBitcast(ExtVT, RHi);
22351 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
22352 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
22353 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
22354 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
22355 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22358 if (VT == MVT::v8i16) {
22359 unsigned ShiftOpcode = Op->getOpcode();
22361 // If we have a constant shift amount, the non-SSE41 path is best as
22362 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
22363 bool UseSSE41 = Subtarget.hasSSE41() &&
22364 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22366 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
22367 // On SSE41 targets we make use of the fact that VSELECT lowers
22368 // to PBLENDVB which selects bytes based just on the sign bit.
22370 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
22371 V0 = DAG.getBitcast(ExtVT, V0);
22372 V1 = DAG.getBitcast(ExtVT, V1);
22373 Sel = DAG.getBitcast(ExtVT, Sel);
22374 return DAG.getBitcast(
22375 VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
22377 // On pre-SSE41 targets we splat the sign bit - a negative value will
22378 // set all bits of the lanes to true and VSELECT uses that in
22379 // its OR(AND(V0,C),AND(V1,~C)) lowering.
22381 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
22382 return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
22385 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
22387 // On SSE41 targets we need to replicate the shift mask in both
22388 // bytes for PBLENDVB.
22391 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
22392 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
22394 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
22397 // r = VSELECT(r, shift(r, 8), a);
22398 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
22399 R = SignBitSelect(Amt, M, R);
22402 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22404 // r = VSELECT(r, shift(r, 4), a);
22405 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22406 R = SignBitSelect(Amt, M, R);
22409 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22411 // r = VSELECT(r, shift(r, 2), a);
22412 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22413 R = SignBitSelect(Amt, M, R);
22416 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22418 // return VSELECT(r, shift(r, 1), a);
22419 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22420 R = SignBitSelect(Amt, M, R);
22424 // Decompose 256-bit shifts into smaller 128-bit shifts.
22425 if (VT.is256BitVector())
22426 return Lower256IntArith(Op, DAG);
22431 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
22432 SelectionDAG &DAG) {
22433 MVT VT = Op.getSimpleValueType();
22435 SDValue R = Op.getOperand(0);
22436 SDValue Amt = Op.getOperand(1);
22438 assert(VT.isVector() && "Custom lowering only for vector rotates!");
22439 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22440 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
22442 // XOP has 128-bit vector variable + immediate rotates.
22443 // +ve/-ve Amt = rotate left/right.
22445 // Split 256-bit integers.
22446 if (VT.is256BitVector())
22447 return Lower256IntArith(Op, DAG);
22449 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
22451 // Attempt to rotate by immediate.
22452 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22453 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
22454 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
22455 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
22456 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
22457 DAG.getConstant(RotateAmt, DL, MVT::i8));
22461 // Use general rotate by variable (per-element).
22462 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
22465 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22466 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
22467 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22468 // looks for this combo and may remove the "setcc" instruction if the "setcc"
22469 // has only one use.
22470 SDNode *N = Op.getNode();
22471 SDValue LHS = N->getOperand(0);
22472 SDValue RHS = N->getOperand(1);
22473 unsigned BaseOp = 0;
22474 X86::CondCode Cond;
22476 switch (Op.getOpcode()) {
22477 default: llvm_unreachable("Unknown ovf instruction!");
22479 // An add of one will be selected as an INC. Note that INC doesn't
22480 // set CF, so we can't do this for UADDO.
22481 if (isOneConstant(RHS)) {
22482 BaseOp = X86ISD::INC;
22483 Cond = X86::COND_O;
22486 BaseOp = X86ISD::ADD;
22487 Cond = X86::COND_O;
22490 BaseOp = X86ISD::ADD;
22491 Cond = X86::COND_B;
22494 // A subtract of one will be selected as a DEC. Note that DEC doesn't
22495 // set CF, so we can't do this for USUBO.
22496 if (isOneConstant(RHS)) {
22497 BaseOp = X86ISD::DEC;
22498 Cond = X86::COND_O;
22501 BaseOp = X86ISD::SUB;
22502 Cond = X86::COND_O;
22505 BaseOp = X86ISD::SUB;
22506 Cond = X86::COND_B;
22509 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
22510 Cond = X86::COND_O;
22512 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
22513 if (N->getValueType(0) == MVT::i8) {
22514 BaseOp = X86ISD::UMUL8;
22515 Cond = X86::COND_O;
22518 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
22520 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
22522 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
22524 if (N->getValueType(1) == MVT::i1)
22525 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22527 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22531 // Also sets EFLAGS.
22532 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
22533 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22535 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
22537 if (N->getValueType(1) == MVT::i1)
22538 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22540 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22543 /// Returns true if the operand type is exactly twice the native width, and
22544 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
22545 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
22546 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
22547 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
22548 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
22551 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
22552 else if (OpWidth == 128)
22553 return Subtarget.hasCmpxchg16b();
22558 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22559 return needsCmpXchgNb(SI->getValueOperand()->getType());
22562 // Note: this turns large loads into lock cmpxchg8b/16b.
22563 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
22564 TargetLowering::AtomicExpansionKind
22565 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22566 auto PTy = cast<PointerType>(LI->getPointerOperandType());
22567 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
22568 : AtomicExpansionKind::None;
22571 TargetLowering::AtomicExpansionKind
22572 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22573 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22574 Type *MemType = AI->getType();
22576 // If the operand is too big, we must see if cmpxchg8/16b is available
22577 // and default to library calls otherwise.
22578 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
22579 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
22580 : AtomicExpansionKind::None;
22583 AtomicRMWInst::BinOp Op = AI->getOperation();
22586 llvm_unreachable("Unknown atomic operation");
22587 case AtomicRMWInst::Xchg:
22588 case AtomicRMWInst::Add:
22589 case AtomicRMWInst::Sub:
22590 // It's better to use xadd, xsub or xchg for these in all cases.
22591 return AtomicExpansionKind::None;
22592 case AtomicRMWInst::Or:
22593 case AtomicRMWInst::And:
22594 case AtomicRMWInst::Xor:
22595 // If the atomicrmw's result isn't actually used, we can just add a "lock"
22596 // prefix to a normal instruction for these operations.
22597 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22598 : AtomicExpansionKind::None;
22599 case AtomicRMWInst::Nand:
22600 case AtomicRMWInst::Max:
22601 case AtomicRMWInst::Min:
22602 case AtomicRMWInst::UMax:
22603 case AtomicRMWInst::UMin:
22604 // These always require a non-trivial set of data operations on x86. We must
22605 // use a cmpxchg loop.
22606 return AtomicExpansionKind::CmpXChg;
22611 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22612 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22613 Type *MemType = AI->getType();
22614 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22615 // there is no benefit in turning such RMWs into loads, and it is actually
22616 // harmful as it introduces a mfence.
22617 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
22620 auto Builder = IRBuilder<>(AI);
22621 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22622 auto SynchScope = AI->getSynchScope();
22623 // We must restrict the ordering to avoid generating loads with Release or
22624 // ReleaseAcquire orderings.
22625 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22626 auto Ptr = AI->getPointerOperand();
22628 // Before the load we need a fence. Here is an example lifted from
22629 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
22630 // is required:
22631 // Thread 0:
22632 //   x.store(1, relaxed);
22633 //   r1 = y.fetch_add(0, release);
22634 // Thread 1:
22635 //   y.fetch_add(42, acquire);
22636 //   r2 = x.load(relaxed);
22637 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22638 // lowered to just a load without a fence. A mfence flushes the store buffer,
22639 // making the optimization clearly correct.
22640 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
22641 // otherwise, we might be able to be more aggressive on relaxed idempotent
22642 // rmw. In practice, they do not look useful, so we don't try to be
22643 // especially clever.
22644 if (SynchScope == SingleThread)
22645 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
22646 // the IR level, so we must wrap it in an intrinsic.
22647 return nullptr;
22649 if (!Subtarget.hasMFence())
22650 // FIXME: it might make sense to use a locked operation here but on a
22651 // different cache-line to prevent cache-line bouncing. In practice it
22652 // is probably a small win, and x86 processors without mfence are rare
22653 // enough that we do not bother.
22654 return nullptr;
22656 Function *MFence =
22657 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
22658 Builder.CreateCall(MFence, {});
22660 // Finally we can emit the atomic load.
22661 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
22662 AI->getType()->getPrimitiveSizeInBits());
22663 Loaded->setAtomic(Order, SynchScope);
22664 AI->replaceAllUsesWith(Loaded);
22665 AI->eraseFromParent();
22669 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
22670 SelectionDAG &DAG) {
22672 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
22673 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
22674 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
22675 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
22677 // The only fence that needs an instruction is a sequentially-consistent
22678 // cross-thread fence.
22679 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
22680 FenceScope == CrossThread) {
22681 if (Subtarget.hasMFence())
22682 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
22684 SDValue Chain = Op.getOperand(0);
22685 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
22687 DAG.getRegister(X86::ESP, MVT::i32), // Base
22688 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
22689 DAG.getRegister(0, MVT::i32), // Index
22690 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
22691 DAG.getRegister(0, MVT::i32), // Segment.
22695 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
22696 return SDValue(Res, 0);
22699 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
22700 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
22703 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
22704 SelectionDAG &DAG) {
22705 MVT T = Op.getSimpleValueType();
22709 switch(T.SimpleTy) {
22710 default: llvm_unreachable("Invalid value type!");
22711 case MVT::i8: Reg = X86::AL; size = 1; break;
22712 case MVT::i16: Reg = X86::AX; size = 2; break;
22713 case MVT::i32: Reg = X86::EAX; size = 4; break;
22715 assert(Subtarget.is64Bit() && "Node not type legal!");
22716 Reg = X86::RAX; size = 8;
22719 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
22720 Op.getOperand(2), SDValue());
22721 SDValue Ops[] = { cpIn.getValue(0),
22724 DAG.getTargetConstant(size, DL, MVT::i8),
22725 cpIn.getValue(1) };
22726 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
22727 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
22728 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
22732 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
22733 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
22734 MVT::i32, cpOut.getValue(2));
22735 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
22737 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
22738 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
22739 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
22743 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
22744 SelectionDAG &DAG) {
22745 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
22746 MVT DstVT = Op.getSimpleValueType();
22748 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
22749 SrcVT == MVT::i64) {
22750 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22751 if (DstVT != MVT::f64)
22752 // This conversion needs to be expanded.
22755 SDValue Op0 = Op->getOperand(0);
22756 SmallVector<SDValue, 16> Elts;
22760 if (SrcVT.isVector()) {
22761 NumElts = SrcVT.getVectorNumElements();
22762 SVT = SrcVT.getVectorElementType();
22764 // Widen the input vector in the case of MVT::v2i32.
22765 // Example: from MVT::v2i32 to MVT::v4i32.
22766 for (unsigned i = 0, e = NumElts; i != e; ++i)
22767 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
22768 DAG.getIntPtrConstant(i, dl)));
22770 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
22771 "Unexpected source type in LowerBITCAST");
22772 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22773 DAG.getIntPtrConstant(0, dl)));
22774 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22775 DAG.getIntPtrConstant(1, dl)));
22779 // Explicitly mark the extra elements as Undef.
22780 Elts.append(NumElts, DAG.getUNDEF(SVT));
22782 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22783 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
22784 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
22785 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
22786 DAG.getIntPtrConstant(0, dl));
22789 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
22790 Subtarget.hasMMX() && "Unexpected custom BITCAST");
22791 assert((DstVT == MVT::i64 ||
22792 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
22793 "Unexpected custom BITCAST");
22794 // i64 <=> MMX conversions are Legal.
22795 if (SrcVT==MVT::i64 && DstVT.isVector())
22797 if (DstVT==MVT::i64 && SrcVT.isVector())
22799 // MMX <=> MMX conversions are Legal.
22800 if (SrcVT.isVector() && DstVT.isVector())
22802 // All other conversions need to be expanded.
22806 /// Compute the horizontal sum of bytes in V for the elements of VT.
22808 /// Requires V to be a byte vector and VT to be an integer vector type with
22809 /// wider elements than V's type. The width of the elements of VT determines
22810 /// how many bytes of V are summed horizontally to produce each element of the
22811 /// result.
22812 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
22813 const X86Subtarget &Subtarget,
22814 SelectionDAG &DAG) {
22816 MVT ByteVecVT = V.getSimpleValueType();
22817 MVT EltVT = VT.getVectorElementType();
22818 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
22819 "Expected value to have byte element type.");
22820 assert(EltVT != MVT::i8 &&
22821 "Horizontal byte sum only makes sense for wider elements!");
22822 unsigned VecSize = VT.getSizeInBits();
22823 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
22825 // The PSADBW instruction horizontally adds all bytes and leaves the result in
22826 // i64 chunks, so it directly computes the pop count for v2i64 and v4i64.
22827 if (EltVT == MVT::i64) {
22828 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22829 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22830 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
22831 return DAG.getBitcast(VT, V);
22834 if (EltVT == MVT::i32) {
22835 // We unpack the low half and high half into i32s interleaved with zeros so
22836 // that we can use PSADBW to horizontally sum them. The most useful part of
22837 // this is that it lines up the results of two PSADBW instructions to be
22838 // two v2i64 vectors which concatenated are the 4 population counts. We can
22839 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
22840 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
22841 SDValue V32 = DAG.getBitcast(VT, V);
22842 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
22843 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
22845 // Do the horizontal sums into two v2i64s.
22846 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22847 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22848 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22849 DAG.getBitcast(ByteVecVT, Low), Zeros);
22850 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22851 DAG.getBitcast(ByteVecVT, High), Zeros);
22853 // Merge them together.
22854 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
22855 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
22856 DAG.getBitcast(ShortVecVT, Low),
22857 DAG.getBitcast(ShortVecVT, High));
22859 return DAG.getBitcast(VT, V);
22862 // The only element type left is i16.
22863 assert(EltVT == MVT::i16 && "Unknown how to handle type");
22865 // To obtain pop count for each i16 element starting from the pop count for
22866 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
22867 // right by 8. It is important to shift as i16s as i8 vector shift isn't
22868 // directly supported.
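// Concretely, each i16 lane currently holds popcnt(lo byte) in its low byte
// and popcnt(hi byte) in its high byte; the i16 shift-left by 8 moves the
// low-byte count up, the byte-wise add forms the sum in the high byte, and
// the i16 shift-right by 8 brings that sum down with zeroes above it.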
22869 SDValue ShifterV = DAG.getConstant(8, DL, VT);
22870 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22871 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
22872 DAG.getBitcast(ByteVecVT, V));
22873 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22876 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
22877 const X86Subtarget &Subtarget,
22878 SelectionDAG &DAG) {
22879 MVT VT = Op.getSimpleValueType();
22880 MVT EltVT = VT.getVectorElementType();
22881 unsigned VecSize = VT.getSizeInBits();
22883 // Implement a lookup table in register by using an algorithm based on:
22884 // http://wm.ite.pl/articles/sse-popcount.html
22886 // The general idea is that every lower byte nibble in the input vector is an
22887 // index into an in-register pre-computed pop count table. We then split up the
22888 // input vector into two new ones: (1) a vector with only the shifted-right
22889 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
22890 // masked out higher ones) for each byte. PSHUFB is used separately with both
22891 // to index the in-register table. Next, both are added and the result is an
22892 // i8 vector where each element contains the pop count for the input byte.
22894 // To obtain the pop count for elements != i8, we follow up with the same
22895 // approach and use additional tricks as described below.
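// Worked example for one input byte, say 0xE5: the high nibble 0xE indexes
// LUT[0xe] = 3, the low nibble 0x5 indexes LUT[0x5] = 2, and the PSHUFB
// lookups plus the final add produce 5, the pop count of 0xE5.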
22897 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
22898 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
22899 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
22900 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
22902 int NumByteElts = VecSize / 8;
22903 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
22904 SDValue In = DAG.getBitcast(ByteVecVT, Op);
22905 SmallVector<SDValue, 64> LUTVec;
22906 for (int i = 0; i < NumByteElts; ++i)
22907 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22908 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
22909 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
22912 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
22913 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
22916 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
22918 // The input vector is used as the shuffle mask that indexes elements into the
22919 // LUT. After counting low and high nibbles, add the two vectors to obtain the
22920 // final pop count per i8 element.
22921 SDValue HighPopCnt =
22922 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
22923 SDValue LowPopCnt =
22924 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
22925 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
22927 if (EltVT == MVT::i8)
22928 return PopCnt;
22930 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
22933 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
22934 const X86Subtarget &Subtarget,
22935 SelectionDAG &DAG) {
22936 MVT VT = Op.getSimpleValueType();
22937 assert(VT.is128BitVector() &&
22938 "Only 128-bit vector bitmath lowering supported.");
22940 int VecSize = VT.getSizeInBits();
22941 MVT EltVT = VT.getVectorElementType();
22942 int Len = EltVT.getSizeInBits();
22944 // This is the vectorized version of the "best" algorithm from
22945 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
22946 // with a minor tweak to use a series of adds + shifts instead of vector
22947 // multiplications. Implemented for all integer vector types. We only use
22948 // this when we don't have SSSE3, which allows a LUT-based lowering that is
22949 // much faster, even faster than using native popcnt instructions.
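// A rough per-byte trace for the value 0xDA (5 bits set):
//   v - ((v >> 1) & 0x55)          -> 0x95 (per-pair counts 2,1,1,1)
//   (v & 0x33) + ((v >> 2) & 0x33) -> 0x32 (per-nibble counts 3,2)
//   (v + (v >> 4)) & 0x0F          -> 0x05 (the byte's pop count)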
22951 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
22952 MVT VT = V.getSimpleValueType();
22953 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
22954 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
22956 auto GetMask = [&](SDValue V, APInt Mask) {
22957 MVT VT = V.getSimpleValueType();
22958 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
22959 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
22962 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
22963 // x86, so set the SRL type to have elements at least i16 wide. This is
22964 // correct because all of our SRLs are followed immediately by a mask anyway
22965 // that handles any bits that sneak into the high bits of the byte elements.
22966 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
22968 SDValue V = Op;
22970 // v = v - ((v >> 1) & 0x55555555...)
22971 SDValue Srl =
22972 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
22973 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
22974 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
22976 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
22977 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
22978 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
22979 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
22980 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
22982 // v = (v + (v >> 4)) & 0x0F0F0F0F...
22983 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
22984 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
22985 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
22987 // At this point, V contains the byte-wise population count, and we are
22988 // merely doing a horizontal sum if necessary to get the wider element types.
22990 if (EltVT == MVT::i8)
22991 return V;
22993 return LowerHorizontalByteSum(
22994 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
22998 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
22999 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23000 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23001 SelectionDAG &DAG) {
23002 MVT VT = Op.getSimpleValueType();
23003 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23004 "Unknown CTPOP type to handle");
23005 SDLoc DL(Op.getNode());
23006 SDValue Op0 = Op.getOperand(0);
23008 if (!Subtarget.hasSSSE3()) {
23009 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23010 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23011 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
23014 // Decompose 256-bit ops into smaller 128-bit ops.
23015 if (VT.is256BitVector() && !Subtarget.hasInt256())
23016 return Lower256IntUnary(Op, DAG);
23018 // Decompose 512-bit ops into smaller 256-bit ops.
23019 if (VT.is512BitVector() && !Subtarget.hasBWI())
23020 return Lower512IntUnary(Op, DAG);
23022 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
23025 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23026 SelectionDAG &DAG) {
23027 assert(Op.getSimpleValueType().isVector() &&
23028 "We only do custom lowering for vector population count.");
23029 return LowerVectorCTPOP(Op, Subtarget, DAG);
23032 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23033 MVT VT = Op.getSimpleValueType();
23034 SDValue In = Op.getOperand(0);
23037 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
23038 // perform the BITREVERSE.
23039 if (!VT.isVector()) {
23040 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23041 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23042 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23043 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
23044 DAG.getIntPtrConstant(0, DL));
23047 int NumElts = VT.getVectorNumElements();
23048 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23050 // Decompose 256-bit ops into smaller 128-bit ops.
23051 if (VT.is256BitVector())
23052 return Lower256IntUnary(Op, DAG);
23054 assert(VT.is128BitVector() &&
23055 "Only 128-bit vector bitreverse lowering supported.");
23057 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23058 // perform the BSWAP in the same shuffle.
23059 // It's best to shuffle using the second operand, as this will implicitly
23060 // allow memory folding for multiple vectors.
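// In the mask built below, the bytes of each element are emitted in reverse
// order (so the shuffle also performs the BSWAP), they select from the second
// VPPERM operand (source bytes 16..31, leaving the first operand free for
// folding), and OR'ing in (2 << 5) asks VPPERM to bit-reverse each byte.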
23061 SmallVector<SDValue, 16> MaskElts;
23062 for (int i = 0; i != NumElts; ++i) {
23063 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23064 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23065 int PermuteByte = SourceByte | (2 << 5);
23066 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23070 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23071 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23072 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23074 return DAG.getBitcast(VT, Res);
23077 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23078 SelectionDAG &DAG) {
23079 if (Subtarget.hasXOP())
23080 return LowerBITREVERSE_XOP(Op, DAG);
23082 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23084 MVT VT = Op.getSimpleValueType();
23085 SDValue In = Op.getOperand(0);
23088 unsigned NumElts = VT.getVectorNumElements();
23089 assert(VT.getScalarType() == MVT::i8 &&
23090 "Only byte vector BITREVERSE supported");
23092 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
23093 if (VT.is256BitVector() && !Subtarget.hasInt256())
23094 return Lower256IntUnary(Op, DAG);
23096 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its two
23097 // nibbles, and a PSHUFB lookup finds the bit-reverse of each 0-15 value
23098 // (moved to the other nibble).
23099 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
23100 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
23101 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
23103 const int LoLUT[16] = {
23104 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
23105 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
23106 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
23107 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
23108 const int HiLUT[16] = {
23109 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
23110 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
23111 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
23112 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
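// For example, the byte 0x1E bit-reverses to 0x78: LoLUT[0xe] = 0x70 supplies
// the reversed low nibble (now in the high nibble), HiLUT[0x1] = 0x08 supplies
// the reversed high nibble (now in the low nibble), and OR'ing them gives 0x78.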
23114 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
23115 for (unsigned i = 0; i < NumElts; ++i) {
23116 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
23117 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
23120 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
23121 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
23122 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
23123 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
23124 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
23127 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
23128 unsigned NewOpc = 0;
23129 switch (N->getOpcode()) {
23130 case ISD::ATOMIC_LOAD_ADD:
23131 NewOpc = X86ISD::LADD;
23132 break;
23133 case ISD::ATOMIC_LOAD_SUB:
23134 NewOpc = X86ISD::LSUB;
23135 break;
23136 case ISD::ATOMIC_LOAD_OR:
23137 NewOpc = X86ISD::LOR;
23138 break;
23139 case ISD::ATOMIC_LOAD_XOR:
23140 NewOpc = X86ISD::LXOR;
23141 break;
23142 case ISD::ATOMIC_LOAD_AND:
23143 NewOpc = X86ISD::LAND;
23144 break;
23145 default:
23146 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
23147 }
23149 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
23150 return DAG.getMemIntrinsicNode(
23151 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
23152 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
23153 /*MemVT=*/N->getSimpleValueType(0), MMO);
23156 /// Lower atomic_load_ops into LOCK-prefixed operations.
23157 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
23158 const X86Subtarget &Subtarget) {
23159 SDValue Chain = N->getOperand(0);
23160 SDValue LHS = N->getOperand(1);
23161 SDValue RHS = N->getOperand(2);
23162 unsigned Opc = N->getOpcode();
23163 MVT VT = N->getSimpleValueType(0);
23166 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
23167 // can only be lowered when the result is unused; such used-result ops should
23168 // have already been transformed into a cmpxchg loop in AtomicExpand.
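// For example, an 'atomicrmw add' whose result is unused becomes a single
// "lock add mem, reg" via lowerAtomicArithWithLOCK, while a used
// 'atomicrmw sub' is rewritten here as an add of the negated operand so that
// it can still be selected as XADD.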
23169 if (N->hasAnyUseOfValue(0)) {
23170 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
23171 // select LXADD if LOCK_SUB can't be selected.
23172 if (Opc == ISD::ATOMIC_LOAD_SUB) {
23173 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
23174 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
23175 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
23176 RHS, AN->getMemOperand());
23178 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23179 "Used AtomicRMW ops other than Add should have been expanded!");
23183 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
23184 // RAUW the chain, but don't worry about the result, as it's unused.
23185 assert(!N->hasAnyUseOfValue(0));
23186 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
23190 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
23191 SDNode *Node = Op.getNode();
23193 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
23195 // Convert seq_cst store -> xchg
23196 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
23197 // FIXME: On 32-bit, store -> fist or movq would be more efficient
23198 // (The only way to get a 16-byte store is cmpxchg16b)
23199 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
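// For example, a seq_cst i32 store becomes an XCHG, whose implicit LOCK prefix
// provides the required StoreLoad barrier without a separate MFENCE.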
23200 if (cast<AtomicSDNode>(Node)->getOrdering() ==
23201 AtomicOrdering::SequentiallyConsistent ||
23202 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
23203 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
23204 cast<AtomicSDNode>(Node)->getMemoryVT(),
23205 Node->getOperand(0),
23206 Node->getOperand(1), Node->getOperand(2),
23207 cast<AtomicSDNode>(Node)->getMemOperand());
23208 return Swap.getValue(1);
23210 // Other atomic stores have a simple pattern.
23214 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
23215 MVT VT = Op.getNode()->getSimpleValueType(0);
23217 // Let legalize expand this if it isn't a legal type yet.
23218 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
23221 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23224 bool ExtraOp = false;
23225 switch (Op.getOpcode()) {
23226 default: llvm_unreachable("Invalid code");
23227 case ISD::ADDC: Opc = X86ISD::ADD; break;
23228 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
23229 case ISD::SUBC: Opc = X86ISD::SUB; break;
23230 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
23234 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
23236 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
23237 Op.getOperand(1), Op.getOperand(2));
23240 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
23241 SDNode *N = Op.getNode();
23242 MVT VT = N->getSimpleValueType(0);
23244 // Let legalize expand this if it isn't a legal type yet.
23245 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
23248 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23251 // Set the carry flag.
23252 SDValue Carry = Op.getOperand(2);
23253 EVT CarryVT = Carry.getValueType();
23254 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
23255 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23256 Carry, DAG.getConstant(NegOne, DL, CarryVT));
23258 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
23259 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
23260 Op.getOperand(1), Carry.getValue(1));
23262 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
23263 if (N->getValueType(1) == MVT::i1)
23264 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23266 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23269 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
23270 SelectionDAG &DAG) {
23271 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
23273 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
23274 // which returns the values as { float, float } (in XMM0) or
23275 // { double, double } (which is returned in XMM0, XMM1).
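// For example, for f32 the call effectively returns a <4 x float> with sin(x)
// in lane 0 and cos(x) in lane 1; the extracts below split the two lanes and
// repackage them as the FSINCOS results.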
23277 SDValue Arg = Op.getOperand(0);
23278 EVT ArgVT = Arg.getValueType();
23279 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23281 TargetLowering::ArgListTy Args;
23282 TargetLowering::ArgListEntry Entry;
23286 Entry.IsSExt = false;
23287 Entry.IsZExt = false;
23288 Args.push_back(Entry);
23290 bool isF64 = ArgVT == MVT::f64;
23291 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
23292 // the small struct {f32, f32} is returned in (eax, edx). For f64,
23293 // the results are returned via SRet in memory.
23294 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
23295 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23297 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
23299 Type *RetTy = isF64
23300 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
23301 : (Type*)VectorType::get(ArgTy, 4);
23303 TargetLowering::CallLoweringInfo CLI(DAG);
23304 CLI.setDebugLoc(dl)
23305 .setChain(DAG.getEntryNode())
23306 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
23308 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
23311 // Returned in xmm0 and xmm1.
23312 return CallResult.first;
23314 // Returned in bits 0:31 and 32:63 of xmm0.
23315 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23316 CallResult.first, DAG.getIntPtrConstant(0, dl));
23317 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23318 CallResult.first, DAG.getIntPtrConstant(1, dl));
23319 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
23320 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
23323 /// Widen a vector input to a vector of NVT. The
23324 /// input vector must have the same element type as NVT.
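/// For example, widening a v2i32 value to v4i32 inserts it as subvector 0 of a
/// v4i32 that is otherwise undef, or all-zeroes when FillWithZeroes is set;
/// build_vector inputs are instead rebuilt with the extra lanes appended.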
23325 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
23326 bool FillWithZeroes = false) {
23327 // Check if InOp already has the right width.
23328 MVT InVT = InOp.getSimpleValueType();
23332 if (InOp.isUndef())
23333 return DAG.getUNDEF(NVT);
23335 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
23336 "input and widen element type must match");
23338 unsigned InNumElts = InVT.getVectorNumElements();
23339 unsigned WidenNumElts = NVT.getVectorNumElements();
23340 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
23341 "Unexpected request for vector widening");
23343 EVT EltVT = NVT.getVectorElementType();
23346 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
23347 InOp.getNumOperands() == 2) {
23348 SDValue N1 = InOp.getOperand(1);
23349 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
23351 InOp = InOp.getOperand(0);
23352 InVT = InOp.getSimpleValueType();
23353 InNumElts = InVT.getVectorNumElements();
23356 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
23357 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
23358 SmallVector<SDValue, 16> Ops;
23359 for (unsigned i = 0; i < InNumElts; ++i)
23360 Ops.push_back(InOp.getOperand(i));
23362 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
23363 DAG.getUNDEF(EltVT);
23364 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
23365 Ops.push_back(FillVal);
23366 return DAG.getBuildVector(NVT, dl, Ops);
23368 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
23370 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
23371 InOp, DAG.getIntPtrConstant(0, dl));
23374 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
23375 SelectionDAG &DAG) {
23376 assert(Subtarget.hasAVX512() &&
23377 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23379 // The X86 scatter instruction kills the mask register, so its type should be
23380 // added to the list of return values.
23381 // If the "scatter" already has 2 return values, it has already been handled.
23382 if (Op.getNode()->getNumValues() == 2)
23385 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
23386 SDValue Src = N->getValue();
23387 MVT VT = Src.getSimpleValueType();
23388 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
23391 SDValue NewScatter;
23392 SDValue Index = N->getIndex();
23393 SDValue Mask = N->getMask();
23394 SDValue Chain = N->getChain();
23395 SDValue BasePtr = N->getBasePtr();
23396 MVT MemVT = N->getMemoryVT().getSimpleVT();
23397 MVT IndexVT = Index.getSimpleValueType();
23398 MVT MaskVT = Mask.getSimpleValueType();
23400 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
23401 // The v2i32 value was promoted to v2i64. Now we "redo" the type legalizer's
23402 // work and widen the original v2i32 value to v4i32: the original lanes sit at
23403 // indices 0 and 2 of the bitcast v4i32, so the shuffle below gathers them.
23405 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
23406 "Unexpected memory type");
23407 int ShuffleMask[] = {0, 2, -1, -1};
23408 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
23409 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
23410 // Now we have 4 elements instead of 2.
23411 // Expand the index.
23412 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
23413 Index = ExtendToType(Index, NewIndexVT, DAG);
23415 // Expand the mask with zeroes
23416 // Mask may be <2 x i64> or <2 x i1> at this moment
23417 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23418 "Unexpected mask type");
23419 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
23420 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23424 unsigned NumElts = VT.getVectorNumElements();
23425 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23426 !Index.getSimpleValueType().is512BitVector()) {
23427 // AVX512F supports only 512-bit vectors; either the data or the index should
23428 // be 512 bits wide. If both the index and the data are 256-bit but the vector
23429 // contains 8 elements, we just sign-extend the index.
23430 if (IndexVT == MVT::v8i32)
23431 // Just extend index
23432 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23434 // The minimal number of elts in scatter is 8
23437 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23438 // Use original index here, do not modify the index twice
23439 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
23440 if (IndexVT.getScalarType() == MVT::i32)
23441 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23444 // At this point we have a promoted mask operand.
23445 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23446 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23447 // Use the original mask here, do not modify the mask twice
23448 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
23450 // The value that should be stored
23451 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23452 Src = ExtendToType(Src, NewVT, DAG);
23455 // If the mask is "wide" at this point - truncate it to i1 vector
23456 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
23457 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
23459 // The mask is killed by scatter, add it to the values
23460 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
23461 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
23462 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
23463 N->getMemOperand());
23464 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
23465 return SDValue(NewScatter.getNode(), 1);
23468 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
23469 SelectionDAG &DAG) {
23471 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
23472 MVT VT = Op.getSimpleValueType();
23473 MVT ScalarVT = VT.getScalarType();
23474 SDValue Mask = N->getMask();
23477 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23478 "Expanding masked load is supported on AVX-512 target only!");
23480 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23481 "Expanding masked load is supported for 32 and 64-bit types only!");
23483 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
23484 // VLX. Expanding loads of these types are handled below.
23485 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
23488 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23489 "Cannot lower masked load op.");
23491 assert((ScalarVT.getSizeInBits() >= 32 ||
23492 (Subtarget.hasBWI() &&
23493 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23494 "Unsupported masked load op.");
23496 // This operation is legal for targets with VLX, but without
23497 // VLX the vector should be widened to 512 bits.
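// For example, without VLX a masked load of v8f32 is widened to a v16f32 load
// with a zero-extended v16i1 mask, and the extract_subvector at the end
// returns just the original 8 lanes.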
23498 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
23499 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23500 SDValue Src0 = N->getSrc0();
23501 Src0 = ExtendToType(Src0, WideDataVT, DAG);
23503 // Mask element has to be i1.
23504 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23505 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23506 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23508 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23510 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23511 if (MaskEltTy != MVT::i1)
23512 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23513 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23514 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
23515 N->getBasePtr(), Mask, Src0,
23516 N->getMemoryVT(), N->getMemOperand(),
23517 N->getExtensionType(),
23518 N->isExpandingLoad());
23520 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23521 NewLoad.getValue(0),
23522 DAG.getIntPtrConstant(0, dl));
23523 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
23524 return DAG.getMergeValues(RetOps, dl);
23527 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
23528 SelectionDAG &DAG) {
23529 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
23530 SDValue DataToStore = N->getValue();
23531 MVT VT = DataToStore.getSimpleValueType();
23532 MVT ScalarVT = VT.getScalarType();
23533 SDValue Mask = N->getMask();
23536 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23537 "Compressing masked store is supported on AVX-512 targets only!");
23539 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23540 "Compressing masked store is supported for 32 and 64-bit types only!");
23542 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
23543 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
23546 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23547 "Cannot lower masked store op.");
23549 assert((ScalarVT.getSizeInBits() >= 32 ||
23550 (Subtarget.hasBWI() &&
23551 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23552 "Unsupported masked store op.");
23554 // This operation is legal for targets with VLX, but without
23555 // VLX the vector should be widened to 512 bits.
23556 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
23557 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23559 // Mask element has to be i1.
23560 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23561 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23562 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23564 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23566 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
23567 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23568 if (MaskEltTy != MVT::i1)
23569 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23570 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23571 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
23572 Mask, N->getMemoryVT(), N->getMemOperand(),
23573 N->isTruncatingStore(), N->isCompressingStore());
23576 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
23577 SelectionDAG &DAG) {
23578 assert(Subtarget.hasAVX512() &&
23579 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23581 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
23583 MVT VT = Op.getSimpleValueType();
23584 SDValue Index = N->getIndex();
23585 SDValue Mask = N->getMask();
23586 SDValue Src0 = N->getValue();
23587 MVT IndexVT = Index.getSimpleValueType();
23588 MVT MaskVT = Mask.getSimpleValueType();
23590 unsigned NumElts = VT.getVectorNumElements();
23591 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23593 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23594 !Index.getSimpleValueType().is512BitVector()) {
23595 // AVX512F supports only 512-bit vectors; either the data or the index should
23596 // be 512 bits wide. If both the index and the data are 256-bit but the vector
23597 // contains 8 elements, we just sign-extend the index.
23598 if (NumElts == 8) {
23599 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23600 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
23601 N->getOperand(3), Index };
23602 DAG.UpdateNodeOperands(N, Ops);
23606 // Minimal number of elements in Gather
23609 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23610 Index = ExtendToType(Index, NewIndexVT, DAG);
23611 if (IndexVT.getScalarType() == MVT::i32)
23612 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23615 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
23616 // At this point we have a promoted mask operand.
23617 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23618 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23619 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23620 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23622 // The pass-through value
23623 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23624 Src0 = ExtendToType(Src0, NewVT, DAG);
23626 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23627 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23628 N->getMemoryVT(), dl, Ops,
23629 N->getMemOperand());
23630 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23631 NewGather.getValue(0),
23632 DAG.getIntPtrConstant(0, dl));
23633 SDValue RetOps[] = {Extract, NewGather.getValue(1)};
23634 return DAG.getMergeValues(RetOps, dl);
23639 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23640 SelectionDAG &DAG) const {
23641 // TODO: Eventually, the lowering of these nodes should be informed by or
23642 // deferred to the GC strategy for the function in which they appear. For
23643 // now, however, they must be lowered to something. Since they are logically
23644 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23645 // require special handling for these nodes), lower them as literal NOOPs for
23646 // the time being.
23647 SmallVector<SDValue, 2> Ops;
23649 Ops.push_back(Op.getOperand(0));
23650 if (Op->getGluedNode())
23651 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23654 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23655 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23660 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
23661 SelectionDAG &DAG) const {
23662 // TODO: Eventually, the lowering of these nodes should be informed by or
23663 // deferred to the GC strategy for the function in which they appear. For
23664 // now, however, they must be lowered to something. Since they are logically
23665 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23666 // require special handling for these nodes), lower them as literal NOOPs for
23667 // the time being.
23668 SmallVector<SDValue, 2> Ops;
23670 Ops.push_back(Op.getOperand(0));
23671 if (Op->getGluedNode())
23672 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23675 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23676 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23681 /// Provide custom lowering hooks for some operations.
23682 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
23683 switch (Op.getOpcode()) {
23684 default: llvm_unreachable("Should not custom lower this!");
23685 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
23686 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
23687 return LowerCMP_SWAP(Op, Subtarget, DAG);
23688 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
23689 case ISD::ATOMIC_LOAD_ADD:
23690 case ISD::ATOMIC_LOAD_SUB:
23691 case ISD::ATOMIC_LOAD_OR:
23692 case ISD::ATOMIC_LOAD_XOR:
23693 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
23694 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
23695 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
23696 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
23697 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
23698 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
23699 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
23700 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
23701 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
23702 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
23703 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
23704 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
23705 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
23706 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
23707 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
23708 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
23709 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
23710 case ISD::SHL_PARTS:
23711 case ISD::SRA_PARTS:
23712 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
23713 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
23714 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
23715 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
23716 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
23717 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
23718 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
23719 case ISD::ZERO_EXTEND_VECTOR_INREG:
23720 case ISD::SIGN_EXTEND_VECTOR_INREG:
23721 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
23722 case ISD::FP_TO_SINT:
23723 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
23724 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
23725 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
23727 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
23728 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
23729 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
23730 case ISD::SETCC: return LowerSETCC(Op, DAG);
23731 case ISD::SETCCE: return LowerSETCCE(Op, DAG);
23732 case ISD::SELECT: return LowerSELECT(Op, DAG);
23733 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
23734 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
23735 case ISD::VASTART: return LowerVASTART(Op, DAG);
23736 case ISD::VAARG: return LowerVAARG(Op, DAG);
23737 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
23738 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
23739 case ISD::INTRINSIC_VOID:
23740 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
23741 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
23742 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
23743 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
23744 case ISD::FRAME_TO_ARGS_OFFSET:
23745 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
23746 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
23747 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
23748 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
23749 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
23750 case ISD::EH_SJLJ_SETUP_DISPATCH:
23751 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
23752 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
23753 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
23754 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
23756 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
23758 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
23759 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
23761 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
23762 case ISD::UMUL_LOHI:
23763 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
23764 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
23767 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
23773 case ISD::UMULO: return LowerXALUO(Op, DAG);
23774 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
23775 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
23779 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
23780 case ISD::ADDCARRY:
23781 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
23783 case ISD::SUB: return LowerADD_SUB(Op, DAG);
23787 case ISD::UMIN: return LowerMINMAX(Op, DAG);
23788 case ISD::ABS: return LowerABS(Op, DAG);
23789 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
23790 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
23791 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
23792 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
23793 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
23794 case ISD::GC_TRANSITION_START:
23795 return LowerGC_TRANSITION_START(Op, DAG);
23796 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
23797 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
23801 /// Places new result values for the node in Results (their number
23802 /// and types must exactly match those of the original return values of
23803 /// the node), or leaves Results empty, which indicates that the node is not
23804 /// to be custom lowered after all.
23805 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
23806 SmallVectorImpl<SDValue> &Results,
23807 SelectionDAG &DAG) const {
23808 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
23810 if (!Res.getNode())
23813 assert((N->getNumValues() <= Res->getNumValues()) &&
23814 "Lowering returned the wrong number of results!");
23816 // Place new result values based on the result number of N.
23817 // In some cases (LowerSINT_TO_FP, for example) Res has more result values
23818 // than the original node; the chain should be dropped (the last value).
23819 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
23820 Results.push_back(Res.getValue(I));
23823 /// Replace a node with an illegal result type with a new node built out of
23824 /// custom code.
23825 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
23826 SmallVectorImpl<SDValue>&Results,
23827 SelectionDAG &DAG) const {
23829 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23830 switch (N->getOpcode()) {
23832 llvm_unreachable("Do not know how to custom type legalize this operation!");
23833 case X86ISD::AVG: {
23834 // Legalize types for X86ISD::AVG by expanding vectors.
23835 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23837 auto InVT = N->getValueType(0);
23838 auto InVTSize = InVT.getSizeInBits();
23839 const unsigned RegSize =
23840 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
23841 assert((Subtarget.hasBWI() || RegSize < 512) &&
23842 "512-bit vector requires AVX512BW");
23843 assert((Subtarget.hasAVX2() || RegSize < 256) &&
23844 "256-bit vector requires AVX2");
23846 auto ElemVT = InVT.getVectorElementType();
23847 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
23848 RegSize / ElemVT.getSizeInBits());
23849 assert(RegSize % InVT.getSizeInBits() == 0);
23850 unsigned NumConcat = RegSize / InVT.getSizeInBits();
23852 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
23853 Ops[0] = N->getOperand(0);
23854 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23855 Ops[0] = N->getOperand(1);
23856 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23858 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
23859 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
23860 DAG.getIntPtrConstant(0, dl)));
23863 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
23864 case X86ISD::FMINC:
23866 case X86ISD::FMAXC:
23867 case X86ISD::FMAX: {
23868 EVT VT = N->getValueType(0);
23869 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
23870 SDValue UNDEF = DAG.getUNDEF(VT);
23871 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23872 N->getOperand(0), UNDEF);
23873 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23874 N->getOperand(1), UNDEF);
23875 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
23883 case ISD::UDIVREM: {
23884 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
23885 Results.push_back(V);
23888 case ISD::FP_TO_SINT:
23889 case ISD::FP_TO_UINT: {
23890 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
23892 if (N->getValueType(0) == MVT::v2i32) {
23893 assert((IsSigned || Subtarget.hasAVX512()) &&
23894 "Can only handle signed conversion without AVX512");
23895 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23896 SDValue Src = N->getOperand(0);
23897 if (Src.getValueType() == MVT::v2f64) {
23898 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23899 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
23900 : X86ISD::CVTTP2UI,
23901 dl, MVT::v4i32, Src);
23902 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23903 Results.push_back(Res);
23906 if (Src.getValueType() == MVT::v2f32) {
23907 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23908 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23909 DAG.getUNDEF(MVT::v2f32));
23910 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
23911 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
23912 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23913 Results.push_back(Res);
23917 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
23918 // so early out here.
23922 std::pair<SDValue,SDValue> Vals =
23923 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
23924 SDValue FIST = Vals.first, StackSlot = Vals.second;
23925 if (FIST.getNode()) {
23926 EVT VT = N->getValueType(0);
23927 // Return a load from the stack slot.
23928 if (StackSlot.getNode())
23930 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
23932 Results.push_back(FIST);
23936 case ISD::SINT_TO_FP: {
23937 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
23938 SDValue Src = N->getOperand(0);
23939 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
23941 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
23944 case ISD::UINT_TO_FP: {
23945 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23946 EVT VT = N->getValueType(0);
23947 if (VT != MVT::v2f32)
23949 SDValue Src = N->getOperand(0);
23950 EVT SrcVT = Src.getValueType();
23951 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
23952 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
23955 if (SrcVT != MVT::v2i32)
23957 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
23959 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
23960 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
23961 DAG.getBitcast(MVT::v2i64, VBias));
23962 Or = DAG.getBitcast(MVT::v2f64, Or);
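// Note: 0x4330000000000000 is 2^52 as an f64. OR'ing a zero-extended u32 into
// the low mantissa bits of that constant yields exactly the double 2^52 + n,
// so the FSUB below recovers n as an f64 before it is rounded down to f32.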
23963 // TODO: Are there any fast-math-flags to propagate here?
23964 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
23965 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
23968 case ISD::FP_ROUND: {
23969 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
23971 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
23972 Results.push_back(V);
23975 case ISD::FP_EXTEND: {
23976 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
23977 // No other ValueType for FP_EXTEND should reach this point.
23978 assert(N->getValueType(0) == MVT::v2f32 &&
23979 "Do not know how to legalize this Node");
23982 case ISD::INTRINSIC_W_CHAIN: {
23983 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
23985 default : llvm_unreachable("Do not know how to custom type "
23986 "legalize this intrinsic operation!");
23987 case Intrinsic::x86_rdtsc:
23988 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
23990 case Intrinsic::x86_rdtscp:
23991 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
23993 case Intrinsic::x86_rdpmc:
23994 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
23996 case Intrinsic::x86_xgetbv:
23997 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24000 case ISD::INTRINSIC_WO_CHAIN: {
24001 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
24002 Results.push_back(V);
24005 case ISD::READCYCLECOUNTER: {
24006 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24009 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24010 EVT T = N->getValueType(0);
24011 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
24012 bool Regs64bit = T == MVT::i128;
24013 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
24014 SDValue cpInL, cpInH;
24015 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24016 DAG.getConstant(0, dl, HalfT));
24017 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24018 DAG.getConstant(1, dl, HalfT));
24019 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24020 Regs64bit ? X86::RAX : X86::EAX,
24022 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24023 Regs64bit ? X86::RDX : X86::EDX,
24024 cpInH, cpInL.getValue(1));
24025 SDValue swapInL, swapInH;
24026 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24027 DAG.getConstant(0, dl, HalfT));
24028 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24029 DAG.getConstant(1, dl, HalfT));
24031 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24032 swapInH, cpInH.getValue(1));
24033 // If the current function needs the base pointer, RBX,
24034 // we shouldn't use cmpxchg directly.
24035 // Indeed the lowering of that instruction will clobber
24036 // that register and since RBX will be a reserved register
24037 // the register allocator will not make sure its value will
24038 // be properly saved and restored around this live-range.
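// (CMPXCHG8B/CMPXCHG16B implicitly use EBX/RBX for the low half of the
// desired value, which is why the save/restore pseudos below are needed when
// RBX doubles as the base pointer.)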
24039 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24041 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24042 unsigned BasePtr = TRI->getBaseRegister();
24043 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24044 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24045 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24046 // ISel prefers the LCMPXCHG64 variant.
24047 // If that assert breaks, that means it is not the case anymore,
24048 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24049 // not just EBX. This is a matter of accepting i64 input for that
24050 // pseudo, and restoring into the register of the right width
24051 // in the expand pseudo. Everything else should just work.
24052 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24053 "Saving only half of the RBX");
24054 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24055 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24056 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24057 Regs64bit ? X86::RBX : X86::EBX,
24058 HalfT, swapInH.getValue(1));
24059 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24061 /*Glue*/ RBXSave.getValue(2)};
24062 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24065 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24066 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24067 Regs64bit ? X86::RBX : X86::EBX, swapInL,
24068 swapInH.getValue(1));
24069 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24070 swapInL.getValue(1)};
24071 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24073 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24074 Regs64bit ? X86::RAX : X86::EAX,
24075 HalfT, Result.getValue(1));
24076 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24077 Regs64bit ? X86::RDX : X86::EDX,
24078 HalfT, cpOutL.getValue(2));
24079 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24081 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24082 MVT::i32, cpOutH.getValue(2));
24083 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24084 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24086 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24087 Results.push_back(Success);
24088 Results.push_back(EFLAGS.getValue(1));
24091 case ISD::ATOMIC_SWAP:
24092 case ISD::ATOMIC_LOAD_ADD:
24093 case ISD::ATOMIC_LOAD_SUB:
24094 case ISD::ATOMIC_LOAD_AND:
24095 case ISD::ATOMIC_LOAD_OR:
24096 case ISD::ATOMIC_LOAD_XOR:
24097 case ISD::ATOMIC_LOAD_NAND:
24098 case ISD::ATOMIC_LOAD_MIN:
24099 case ISD::ATOMIC_LOAD_MAX:
24100 case ISD::ATOMIC_LOAD_UMIN:
24101 case ISD::ATOMIC_LOAD_UMAX:
24102 case ISD::ATOMIC_LOAD: {
24103 // Delegate to generic TypeLegalization. Situations we can really handle
24104 // should have already been dealt with by AtomicExpandPass.cpp.
24107 case ISD::BITCAST: {
24108 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24109 EVT DstVT = N->getValueType(0);
24110 EVT SrcVT = N->getOperand(0)->getValueType(0);
24112 if (SrcVT != MVT::f64 ||
24113 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24116 unsigned NumElts = DstVT.getVectorNumElements();
24117 EVT SVT = DstVT.getVectorElementType();
24118 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24119 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24120 MVT::v2f64, N->getOperand(0));
24121 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
24123 if (ExperimentalVectorWideningLegalization) {
24124 // If we are legalizing vectors by widening, we already have the desired
24125 // legal vector type, just return it.
24126 Results.push_back(ToVecInt);
24130 SmallVector<SDValue, 8> Elts;
24131 for (unsigned i = 0, e = NumElts; i != e; ++i)
24132 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24133 ToVecInt, DAG.getIntPtrConstant(i, dl)));
24135 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24140 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24141 switch ((X86ISD::NodeType)Opcode) {
24142 case X86ISD::FIRST_NUMBER: break;
24143 case X86ISD::BSF: return "X86ISD::BSF";
24144 case X86ISD::BSR: return "X86ISD::BSR";
24145 case X86ISD::SHLD: return "X86ISD::SHLD";
24146 case X86ISD::SHRD: return "X86ISD::SHRD";
24147 case X86ISD::FAND: return "X86ISD::FAND";
24148 case X86ISD::FANDN: return "X86ISD::FANDN";
24149 case X86ISD::FOR: return "X86ISD::FOR";
24150 case X86ISD::FXOR: return "X86ISD::FXOR";
24151 case X86ISD::FILD: return "X86ISD::FILD";
24152 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
24153 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24154 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24155 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24156 case X86ISD::FLD: return "X86ISD::FLD";
24157 case X86ISD::FST: return "X86ISD::FST";
24158 case X86ISD::CALL: return "X86ISD::CALL";
24159 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
24160 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
24161 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
24162 case X86ISD::BT: return "X86ISD::BT";
24163 case X86ISD::CMP: return "X86ISD::CMP";
24164 case X86ISD::COMI: return "X86ISD::COMI";
24165 case X86ISD::UCOMI: return "X86ISD::UCOMI";
24166 case X86ISD::CMPM: return "X86ISD::CMPM";
24167 case X86ISD::CMPMU: return "X86ISD::CMPMU";
24168 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
24169 case X86ISD::SETCC: return "X86ISD::SETCC";
24170 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
24171 case X86ISD::FSETCC: return "X86ISD::FSETCC";
24172 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
24173 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
24174 case X86ISD::CMOV: return "X86ISD::CMOV";
24175 case X86ISD::BRCOND: return "X86ISD::BRCOND";
24176 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
24177 case X86ISD::IRET: return "X86ISD::IRET";
24178 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
24179 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
24180 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
24181 case X86ISD::Wrapper: return "X86ISD::Wrapper";
24182 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
24183 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
24184 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
24185 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
24186 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
24187 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
24188 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
24189 case X86ISD::PINSRB: return "X86ISD::PINSRB";
24190 case X86ISD::PINSRW: return "X86ISD::PINSRW";
24191 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
24192 case X86ISD::ANDNP: return "X86ISD::ANDNP";
24193 case X86ISD::BLENDI: return "X86ISD::BLENDI";
24194 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
24195 case X86ISD::ADDUS: return "X86ISD::ADDUS";
24196 case X86ISD::SUBUS: return "X86ISD::SUBUS";
24197 case X86ISD::HADD: return "X86ISD::HADD";
24198 case X86ISD::HSUB: return "X86ISD::HSUB";
24199 case X86ISD::FHADD: return "X86ISD::FHADD";
24200 case X86ISD::FHSUB: return "X86ISD::FHSUB";
24201 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
24202 case X86ISD::FMAX: return "X86ISD::FMAX";
24203 case X86ISD::FMAXS: return "X86ISD::FMAXS";
24204 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
24205 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
24206 case X86ISD::FMIN: return "X86ISD::FMIN";
24207 case X86ISD::FMINS: return "X86ISD::FMINS";
24208 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
24209 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
24210 case X86ISD::FMAXC: return "X86ISD::FMAXC";
24211 case X86ISD::FMINC: return "X86ISD::FMINC";
24212 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
24213 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
24214 case X86ISD::FRCP: return "X86ISD::FRCP";
24215 case X86ISD::FRCPS: return "X86ISD::FRCPS";
24216 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
24217 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
24218 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
24219 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
24220 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
24221 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
24222 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
24223 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
24224 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
24225 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
24226 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
24227 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
24228 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
24229 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
24230 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
24231 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
24232 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
24233 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
24234 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
24235 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
24236 case X86ISD::LADD: return "X86ISD::LADD";
24237 case X86ISD::LSUB: return "X86ISD::LSUB";
24238 case X86ISD::LOR: return "X86ISD::LOR";
24239 case X86ISD::LXOR: return "X86ISD::LXOR";
24240 case X86ISD::LAND: return "X86ISD::LAND";
24241 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
24242 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
24243 case X86ISD::VZEXT: return "X86ISD::VZEXT";
24244 case X86ISD::VSEXT: return "X86ISD::VSEXT";
24245 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
24246 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
24247 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
24248 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
24249 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
24250 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
24251 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
24252 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
24253 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
24254 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
24255 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
24256 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
24257 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
24258 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
24259 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
24260 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
24261 case X86ISD::VSHL: return "X86ISD::VSHL";
24262 case X86ISD::VSRL: return "X86ISD::VSRL";
24263 case X86ISD::VSRA: return "X86ISD::VSRA";
24264 case X86ISD::VSHLI: return "X86ISD::VSHLI";
24265 case X86ISD::VSRLI: return "X86ISD::VSRLI";
24266 case X86ISD::VSRAI: return "X86ISD::VSRAI";
24267 case X86ISD::VSRAV: return "X86ISD::VSRAV";
24268 case X86ISD::VROTLI: return "X86ISD::VROTLI";
24269 case X86ISD::VROTRI: return "X86ISD::VROTRI";
24270 case X86ISD::VPPERM: return "X86ISD::VPPERM";
24271 case X86ISD::CMPP: return "X86ISD::CMPP";
24272 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
24273 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
24274 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
24275 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
24276 case X86ISD::ADD: return "X86ISD::ADD";
24277 case X86ISD::SUB: return "X86ISD::SUB";
24278 case X86ISD::ADC: return "X86ISD::ADC";
24279 case X86ISD::SBB: return "X86ISD::SBB";
24280 case X86ISD::SMUL: return "X86ISD::SMUL";
24281 case X86ISD::UMUL: return "X86ISD::UMUL";
24282 case X86ISD::SMUL8: return "X86ISD::SMUL8";
24283 case X86ISD::UMUL8: return "X86ISD::UMUL8";
24284 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
24285 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
24286 case X86ISD::INC: return "X86ISD::INC";
24287 case X86ISD::DEC: return "X86ISD::DEC";
24288 case X86ISD::OR: return "X86ISD::OR";
24289 case X86ISD::XOR: return "X86ISD::XOR";
24290 case X86ISD::AND: return "X86ISD::AND";
24291 case X86ISD::BEXTR: return "X86ISD::BEXTR";
24292 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
24293 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
24294 case X86ISD::PTEST: return "X86ISD::PTEST";
24295 case X86ISD::TESTP: return "X86ISD::TESTP";
24296 case X86ISD::TESTM: return "X86ISD::TESTM";
24297 case X86ISD::TESTNM: return "X86ISD::TESTNM";
24298 case X86ISD::KORTEST: return "X86ISD::KORTEST";
24299 case X86ISD::KTEST: return "X86ISD::KTEST";
24300 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
24301 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
24302 case X86ISD::PACKSS: return "X86ISD::PACKSS";
24303 case X86ISD::PACKUS: return "X86ISD::PACKUS";
24304 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
24305 case X86ISD::VALIGN: return "X86ISD::VALIGN";
24306 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
24307 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
24308 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
24309 case X86ISD::SHUFP: return "X86ISD::SHUFP";
24310 case X86ISD::SHUF128: return "X86ISD::SHUF128";
24311 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
24312 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
24313 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
24314 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
24315 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
24316 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
24317 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
24318 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
24319 case X86ISD::MOVSD: return "X86ISD::MOVSD";
24320 case X86ISD::MOVSS: return "X86ISD::MOVSS";
24321 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
24322 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
24323 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
24324 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
24325 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
24326 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
24327 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
24328 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
24329 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
24330 case X86ISD::VPERMV: return "X86ISD::VPERMV";
24331 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
24332 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
24333 case X86ISD::VPERMI: return "X86ISD::VPERMI";
24334 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
24335 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
24336 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
24337 case X86ISD::VRANGE: return "X86ISD::VRANGE";
24338 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
24339 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
24340 case X86ISD::PSADBW: return "X86ISD::PSADBW";
24341 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
24342 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
24343 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
24344 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
24345 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
24346 case X86ISD::MFENCE: return "X86ISD::MFENCE";
24347 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
24348 case X86ISD::SAHF: return "X86ISD::SAHF";
24349 case X86ISD::RDRAND: return "X86ISD::RDRAND";
24350 case X86ISD::RDSEED: return "X86ISD::RDSEED";
24351 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
24352 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
24353 case X86ISD::VPROT: return "X86ISD::VPROT";
24354 case X86ISD::VPROTI: return "X86ISD::VPROTI";
24355 case X86ISD::VPSHA: return "X86ISD::VPSHA";
24356 case X86ISD::VPSHL: return "X86ISD::VPSHL";
24357 case X86ISD::VPCOM: return "X86ISD::VPCOM";
24358 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
24359 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
24360 case X86ISD::FMADD: return "X86ISD::FMADD";
24361 case X86ISD::FMSUB: return "X86ISD::FMSUB";
24362 case X86ISD::FNMADD: return "X86ISD::FNMADD";
24363 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
24364 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
24365 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
24366 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
24367 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
24368 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
24369 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
24370 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
24371 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
24372 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
24373 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
24374 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
24375 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
24376 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
24377 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
24378 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
24379 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
24380 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
24381 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
24382 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
24383 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
24384 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
24385 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
24386 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
24387 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
24388 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
24389 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
24390 case X86ISD::XTEST: return "X86ISD::XTEST";
24391 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
24392 case X86ISD::EXPAND: return "X86ISD::EXPAND";
24393 case X86ISD::SELECT: return "X86ISD::SELECT";
24394 case X86ISD::SELECTS: return "X86ISD::SELECTS";
24395 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
24396 case X86ISD::RCP28: return "X86ISD::RCP28";
24397 case X86ISD::RCP28S: return "X86ISD::RCP28S";
24398 case X86ISD::EXP2: return "X86ISD::EXP2";
24399 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
24400 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
24401 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
24402 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
24403 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
24404 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
24405 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
24406 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
24407 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
24408 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
24409 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
24410 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
24411 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
24412 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
24413 case X86ISD::SCALEF: return "X86ISD::SCALEF";
24414 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
24415 case X86ISD::ADDS: return "X86ISD::ADDS";
24416 case X86ISD::SUBS: return "X86ISD::SUBS";
24417 case X86ISD::AVG: return "X86ISD::AVG";
24418 case X86ISD::MULHRS: return "X86ISD::MULHRS";
24419 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
24420 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
24421 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
24422 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
24423 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
24424 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
24425 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
24426 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
24427 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
24428 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
24429 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
24430 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
24431 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
24432 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
24433 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
24434 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
24435 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
24436 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
24437 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
24438 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
24439 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
24440 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
24441 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
24442 case X86ISD::LWPINS: return "X86ISD::LWPINS";
24447 /// Return true if the addressing mode represented by AM is legal for this
24448 /// target, for a load/store of the specified type.
24449 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
24450 const AddrMode &AM, Type *Ty,
24451 unsigned AS) const {
24452 // X86 supports extremely general addressing modes.
24453 CodeModel::Model M = getTargetMachine().getCodeModel();
24455 // X86 allows a sign-extended 32-bit immediate field as a displacement.
24456 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
24460 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
24462 // If a reference to this global requires an extra load, we can't fold it.
24463 if (isGlobalStubReference(GVFlags))
24466 // If BaseGV requires a register for the PIC base, we cannot also have a
24467 // BaseReg specified.
24468 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
24471 // If lower 4G is not available, then we must use rip-relative addressing.
24472 if ((M != CodeModel::Small || isPositionIndependent()) &&
24473 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
24477 switch (AM.Scale) {
24483 // These scales always work.
24488 // These scales are formed with basereg+scalereg. Only accept if there is no basereg yet.
24493 default: // Other stuff never works.
24500 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
24501 unsigned Bits = Ty->getScalarSizeInBits();
24503 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
24504 // particularly cheaper than those without.
24508 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
24509 // variable shifts just as cheap as scalar ones.
24510 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
24513 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
24514 // fully general vector.
24518 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
24519 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24521 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
24522 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
24523 return NumBits1 > NumBits2;
24526 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
24527 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24530 if (!isTypeLegal(EVT::getEVT(Ty1)))
24533 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
24535 // Assuming the caller doesn't have a zeroext or signext return parameter,
24536 // truncation all the way down to i1 is valid.
24540 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
24541 return isInt<32>(Imm);
24544 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
24545 // Can also use sub to handle negated immediates.
24546 return isInt<32>(Imm);
24549 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
24550 if (!VT1.isInteger() || !VT2.isInteger())
24552 unsigned NumBits1 = VT1.getSizeInBits();
24553 unsigned NumBits2 = VT2.getSizeInBits();
24554 return NumBits1 > NumBits2;
24557 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
24558 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24559 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
24562 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
24563 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24564 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
24567 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
24568 EVT VT1 = Val.getValueType();
24569 if (isZExtFree(VT1, VT2))
24572 if (Val.getOpcode() != ISD::LOAD)
24575 if (!VT1.isSimple() || !VT1.isInteger() ||
24576 !VT2.isSimple() || !VT2.isInteger())
24579 switch (VT1.getSimpleVT().SimpleTy) {
24584 // X86 has 8, 16, and 32-bit zero-extending loads.
24591 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
24594 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
24595 if (!Subtarget.hasAnyFMA())
24598 VT = VT.getScalarType();
24600 if (!VT.isSimple())
24603 switch (VT.getSimpleVT().SimpleTy) {
24614 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24615 // i16 instructions are longer (0x66 prefix) and potentially slower.
24616 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
24619 /// Targets can use this to indicate that they only support *some*
24620 /// VECTOR_SHUFFLE operations, those with specific masks.
24621 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24622 /// are assumed to be legal.
24624 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
24626 if (!VT.isSimple())
24629 // Not for i1 vectors
24630 if (VT.getSimpleVT().getScalarType() == MVT::i1)
24633 // Very little shuffling can be done for 64-bit vectors right now.
24634 if (VT.getSimpleVT().getSizeInBits() == 64)
24637 // We only care that the types being shuffled are legal. The lowering can
24638 // handle any possible shuffle mask that results.
24639 return isTypeLegal(VT.getSimpleVT());
24643 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24645 // Just delegate to the generic legality, clear masks aren't special.
24646 return isShuffleMaskLegal(Mask, VT);
24649 //===----------------------------------------------------------------------===//
24650 // X86 Scheduler Hooks
24651 //===----------------------------------------------------------------------===//
24653 /// Utility function to emit xbegin specifying the start of an RTM region.
24654 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
24655 const TargetInstrInfo *TII) {
24656 DebugLoc DL = MI.getDebugLoc();
24658 const BasicBlock *BB = MBB->getBasicBlock();
24659 MachineFunction::iterator I = ++MBB->getIterator();
24661 // For the v = xbegin(), we generate
24672 MachineBasicBlock *thisMBB = MBB;
24673 MachineFunction *MF = MBB->getParent();
24674 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
24675 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
24676 MF->insert(I, mainMBB);
24677 MF->insert(I, sinkMBB);
24679 // Transfer the remainder of BB and its successor edges to sinkMBB.
24680 sinkMBB->splice(sinkMBB->begin(), MBB,
24681 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24682 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
24686 // # fallthrough to mainMBB
24687 // # on abort, jump to sinkMBB
24688 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
24689 thisMBB->addSuccessor(mainMBB);
24690 thisMBB->addSuccessor(sinkMBB);
24694 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
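// -1 (all-ones) is the value the xbegin intrinsic reports for a successful
// start; if the transaction aborts, the hardware transfers control to the
// fallback block (sinkMBB) with the abort status already in EAX, so the copy
// in sinkMBB picks up whichever value is live.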
24695 mainMBB->addSuccessor(sinkMBB);
24698 // EAX is live into the sinkMBB
24699 sinkMBB->addLiveIn(X86::EAX);
24700 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
24701 MI.getOperand(0).getReg())
24704 MI.eraseFromParent();
24708 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
24709 // or XMM0_V32I8 in AVX all of this code can be replaced with that
24710 // in the .td file.
24711 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
24712 const TargetInstrInfo *TII) {
24714 switch (MI.getOpcode()) {
24715 default: llvm_unreachable("illegal opcode!");
24716 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
24717 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
24718 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
24719 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
24720 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
24721 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
24722 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
24723 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
24726 DebugLoc dl = MI.getDebugLoc();
24727 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24729 unsigned NumArgs = MI.getNumOperands();
24730 for (unsigned i = 1; i < NumArgs; ++i) {
24731 MachineOperand &Op = MI.getOperand(i);
24732 if (!(Op.isReg() && Op.isImplicit()))
24735 if (MI.hasOneMemOperand())
24736 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24738 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24739 .addReg(X86::XMM0);
24741 MI.eraseFromParent();
24745 // FIXME: Custom handling because TableGen doesn't support multiple implicit
24746 // defs in an instruction pattern
24747 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
24748 const TargetInstrInfo *TII) {
24750 switch (MI.getOpcode()) {
24751 default: llvm_unreachable("illegal opcode!");
24752 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
24753 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
24754 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
24755 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
24756 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
24757 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
24758 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
24759 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
24762 DebugLoc dl = MI.getDebugLoc();
24763 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24765 unsigned NumArgs = MI.getNumOperands(); // remove the results
24766 for (unsigned i = 1; i < NumArgs; ++i) {
24767 MachineOperand &Op = MI.getOperand(i);
24768 if (!(Op.isReg() && Op.isImplicit()))
24771 if (MI.hasOneMemOperand())
24772 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24774 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24777 MI.eraseFromParent();
24781 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24782 const X86Subtarget &Subtarget) {
24783 DebugLoc dl = MI.getDebugLoc();
24784 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24786 // insert input VAL into EAX
24787 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
24788 .addReg(MI.getOperand(0).getReg());
24789 // insert zero to ECX
24790 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24792 // insert zero to EDX
24793 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
24795 // insert WRPKRU instruction
24796 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
24798 MI.eraseFromParent(); // The pseudo is gone now.
24802 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24803 const X86Subtarget &Subtarget) {
24804 DebugLoc dl = MI.getDebugLoc();
24805 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24807 // insert zero to ECX
24808 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24810 // insert RDPKRU instruction
24811 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
24812 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24815 MI.eraseFromParent(); // The pseudo is gone now.
24819 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
24820 const X86Subtarget &Subtarget,
24822 DebugLoc dl = MI.getDebugLoc();
24823 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24824 // Address into RAX/EAX, other two args into ECX, EDX.
24825 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24826 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24827 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24828 for (int i = 0; i < X86::AddrNumOperands; ++i)
24829 MIB.add(MI.getOperand(i));
24831 unsigned ValOps = X86::AddrNumOperands;
24832 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
24833 .addReg(MI.getOperand(ValOps).getReg());
24834 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
24835 .addReg(MI.getOperand(ValOps + 1).getReg());
24837 // The instruction doesn't actually take any operands though.
24838 BuildMI(*BB, MI, dl, TII->get(Opc));
24840 MI.eraseFromParent(); // The pseudo is gone now.
24844 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
24845 const X86Subtarget &Subtarget) {
24846 DebugLoc dl = MI->getDebugLoc();
24847 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24848 // Address into RAX/EAX
24849 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24850 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24851 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24852 for (int i = 0; i < X86::AddrNumOperands; ++i)
24853 MIB.add(MI->getOperand(i));
24855 // The instruction doesn't actually take any operands though.
24856 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
24858 MI->eraseFromParent(); // The pseudo is gone now.
24864 MachineBasicBlock *
24865 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
24866 MachineBasicBlock *MBB) const {
24867 // Emit va_arg instruction on X86-64.
24869 // Operands to this pseudo-instruction:
24870 // 0 ) Output : destination address (reg)
24871 // 1-5) Input : va_list address (addr, i64mem)
24872 // 6 ) ArgSize : Size (in bytes) of vararg type
24873 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
24874 // 8 ) Align : Alignment of type
24875 // 9 ) EFLAGS (implicit-def)
24877 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
24878 static_assert(X86::AddrNumOperands == 5,
24879 "VAARG_64 assumes 5 address operands");
24881 unsigned DestReg = MI.getOperand(0).getReg();
24882 MachineOperand &Base = MI.getOperand(1);
24883 MachineOperand &Scale = MI.getOperand(2);
24884 MachineOperand &Index = MI.getOperand(3);
24885 MachineOperand &Disp = MI.getOperand(4);
24886 MachineOperand &Segment = MI.getOperand(5);
24887 unsigned ArgSize = MI.getOperand(6).getImm();
24888 unsigned ArgMode = MI.getOperand(7).getImm();
24889 unsigned Align = MI.getOperand(8).getImm();
24891 // Memory Reference
24892 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
24893 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
24894 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
24896 // Machine Information
24897 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24898 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
24899 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
24900 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
24901 DebugLoc DL = MI.getDebugLoc();
24903 // struct va_list {
24904 // i32 gp_offset
24905 // i32 fp_offset
24906 // i64 overflow_area (address)
24907 // i64 reg_save_area (address)
24908 // }
24909 // sizeof(va_list) = 24
24910 // alignment(va_list) = 8
24912 unsigned TotalNumIntRegs = 6;
24913 unsigned TotalNumXMMRegs = 8;
24914 bool UseGPOffset = (ArgMode == 1);
24915 bool UseFPOffset = (ArgMode == 2);
24916 unsigned MaxOffset = TotalNumIntRegs * 8 +
24917 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
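// i.e. 6 GP registers * 8 bytes = 48, plus another 8 XMM registers * 16 bytes
// = 128 when pulling from fp_offset, for a MaxOffset of 176.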
24919 // Align ArgSize to a multiple of 8.
24920 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
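// e.g. ArgSize = 12 gives (12 + 7) & ~7 = 16; a size that is already a
// multiple of 8 is left unchanged.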
24921 bool NeedsAlign = (Align > 8);
24923 MachineBasicBlock *thisMBB = MBB;
24924 MachineBasicBlock *overflowMBB;
24925 MachineBasicBlock *offsetMBB;
24926 MachineBasicBlock *endMBB;
24928 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
24929 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
24930 unsigned OffsetReg = 0;
24932 if (!UseGPOffset && !UseFPOffset) {
24933 // If we only pull from the overflow region, we don't create a branch.
24934 // We don't need to alter control flow.
24935 OffsetDestReg = 0; // unused
24936 OverflowDestReg = DestReg;
24938 offsetMBB = nullptr;
24939 overflowMBB = thisMBB;
24942 // First emit code to check if gp_offset (or fp_offset) is below the bound.
24943 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
24944 // If not, pull from overflow_area. (branch to overflowMBB)
24949 // offsetMBB overflowMBB
24954 // Registers for the PHI in endMBB
24955 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
24956 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
24958 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24959 MachineFunction *MF = MBB->getParent();
24960 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24961 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24962 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24964 MachineFunction::iterator MBBIter = ++MBB->getIterator();
24966 // Insert the new basic blocks
24967 MF->insert(MBBIter, offsetMBB);
24968 MF->insert(MBBIter, overflowMBB);
24969 MF->insert(MBBIter, endMBB);
24971 // Transfer the remainder of MBB and its successor edges to endMBB.
24972 endMBB->splice(endMBB->begin(), thisMBB,
24973 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
24974 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
24976 // Make offsetMBB and overflowMBB successors of thisMBB
24977 thisMBB->addSuccessor(offsetMBB);
24978 thisMBB->addSuccessor(overflowMBB);
24980 // endMBB is a successor of both offsetMBB and overflowMBB
24981 offsetMBB->addSuccessor(endMBB);
24982 overflowMBB->addSuccessor(endMBB);
24984 // Load the offset value into a register
24985 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
24986 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
24990 .addDisp(Disp, UseFPOffset ? 4 : 0)
24992 .setMemRefs(MMOBegin, MMOEnd);
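// gp_offset lives at byte 0 of the va_list and fp_offset at byte 4, hence the
// displacement of 0 or 4 used above.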
24994 // Check if there is enough room left to pull this argument.
24995 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
24997 .addImm(MaxOffset + 8 - ArgSizeA8);
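// Since OffsetReg, ArgSizeA8 and MaxOffset are all multiples of 8, the
// condition OffsetReg >= MaxOffset + 8 - ArgSizeA8 is equivalent to
// OffsetReg + ArgSizeA8 > MaxOffset, i.e. the argument would no longer fit
// entirely within the register save area.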
24999 // Branch to "overflowMBB" if offset >= max
25000 // Fall through to "offsetMBB" otherwise
25001 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
25002 .addMBB(overflowMBB);
25005 // In offsetMBB, emit code to use the reg_save_area.
25007 assert(OffsetReg != 0);
25009 // Read the reg_save_area address.
25010 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
25011 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
25017 .setMemRefs(MMOBegin, MMOEnd);
25019 // Zero-extend the offset
25020 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
25021 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
25024 .addImm(X86::sub_32bit);
25026 // Add the offset to the reg_save_area to get the final address.
25027 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
25028 .addReg(OffsetReg64)
25029 .addReg(RegSaveReg);
25031 // Compute the offset for the next argument
25032 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25033 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
25035 .addImm(UseFPOffset ? 16 : 8);
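// Each GP register occupies an 8-byte slot and each XMM register a 16-byte
// slot in the reg_save_area, so advance the offset by the matching amount.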
25037 // Store it back into the va_list.
25038 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
25042 .addDisp(Disp, UseFPOffset ? 4 : 0)
25044 .addReg(NextOffsetReg)
25045 .setMemRefs(MMOBegin, MMOEnd);
25048 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
25053 // Emit code to use overflow area
25056 // Load the overflow_area address into a register.
25057 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
25058 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
25064 .setMemRefs(MMOBegin, MMOEnd);
25066 // If we need to align it, do so. Otherwise, just copy the address
25067 // to OverflowDestReg.
25069 // Align the overflow address
25070 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25071 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
25073 // aligned_addr = (addr + (align-1)) & ~(align-1)
25074 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
25075 .addReg(OverflowAddrReg)
25078 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
25080 .addImm(~(uint64_t)(Align-1));
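// Because Align is a power of two (asserted above), adding Align-1 and then
// masking with ~(Align-1) rounds the overflow address up to the next
// Align-byte boundary.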
25082 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
25083 .addReg(OverflowAddrReg);
25086 // Compute the next overflow address after this argument.
25087 // (the overflow address should be kept 8-byte aligned)
25088 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
25089 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
25090 .addReg(OverflowDestReg)
25091 .addImm(ArgSizeA8);
25093 // Store the new overflow address.
25094 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
25100 .addReg(NextAddrReg)
25101 .setMemRefs(MMOBegin, MMOEnd);
25103 // If we branched, emit the PHI to the front of endMBB.
25105 BuildMI(*endMBB, endMBB->begin(), DL,
25106 TII->get(X86::PHI), DestReg)
25107 .addReg(OffsetDestReg).addMBB(offsetMBB)
25108 .addReg(OverflowDestReg).addMBB(overflowMBB);
25111 // Erase the pseudo instruction
25112 MI.eraseFromParent();
25117 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
25118 MachineInstr &MI, MachineBasicBlock *MBB) const {
25119 // Emit code to save XMM registers to the stack. The ABI says that the
25120 // number of registers to save is given in %al, so it's theoretically
25121 // possible to do an indirect jump trick to avoid saving all of them;
25122 // however, this code takes a simpler approach and just executes all
25123 // of the stores if %al is non-zero. It's less code, and it's probably
25124 // easier on the hardware branch predictor, and stores aren't all that
25125 // expensive anyway.
25127 // Create the new basic blocks. One block contains all the XMM stores,
25128 // and one block is the final destination regardless of whether any
25129 // stores were performed.
25130 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25131 MachineFunction *F = MBB->getParent();
25132 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25133 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
25134 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
25135 F->insert(MBBIter, XMMSaveMBB);
25136 F->insert(MBBIter, EndMBB);
25138 // Transfer the remainder of MBB and its successor edges to EndMBB.
25139 EndMBB->splice(EndMBB->begin(), MBB,
25140 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25141 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
25143 // The original block will now fall through to the XMM save block.
25144 MBB->addSuccessor(XMMSaveMBB);
25145 // The XMMSaveMBB will fall through to the end block.
25146 XMMSaveMBB->addSuccessor(EndMBB);
25148 // Now add the instructions.
25149 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25150 DebugLoc DL = MI.getDebugLoc();
25152 unsigned CountReg = MI.getOperand(0).getReg();
25153 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
25154 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
25156 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
25157 // If %al is 0, branch around the XMM save block.
25158 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
25159 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
25160 MBB->addSuccessor(EndMBB);
25163 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
25164 // that was just emitted, but clearly shouldn't be "saved".
25165 assert((MI.getNumOperands() <= 3 ||
25166 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
25167 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
25168 "Expected last argument to be EFLAGS");
25169 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
25170 // In the XMM save block, save all the XMM argument registers.
25171 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
25172 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
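// Operands 3 .. getNumOperands()-2 are the live XMM argument registers; each
// one gets a 16-byte slot starting at VarArgsFPOffset within the
// RegSaveFrameIndex stack object.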
25173 MachineMemOperand *MMO = F->getMachineMemOperand(
25174 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
25175 MachineMemOperand::MOStore,
25176 /*Size=*/16, /*Align=*/16);
25177 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
25178 .addFrameIndex(RegSaveFrameIndex)
25179 .addImm(/*Scale=*/1)
25180 .addReg(/*IndexReg=*/0)
25181 .addImm(/*Disp=*/Offset)
25182 .addReg(/*Segment=*/0)
25183 .addReg(MI.getOperand(i).getReg())
25184 .addMemOperand(MMO);
25187 MI.eraseFromParent(); // The pseudo instruction is gone now.
25192 // The EFLAGS operand of SelectItr might be missing a kill marker
25193 // because there were multiple uses of EFLAGS, and ISel didn't know
25194 // which to mark. Figure out whether SelectItr should have had a
25195 // kill marker, and set it if it should. Returns the correct kill marker value.
25197 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
25198 MachineBasicBlock* BB,
25199 const TargetRegisterInfo* TRI) {
25200 // Scan forward through BB for a use/def of EFLAGS.
25201 MachineBasicBlock::iterator miI(std::next(SelectItr));
25202 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
25203 const MachineInstr& mi = *miI;
25204 if (mi.readsRegister(X86::EFLAGS))
25206 if (mi.definesRegister(X86::EFLAGS))
25207 break; // Should have kill-flag - update below.
25210 // If we hit the end of the block, check whether EFLAGS is live into a
25212 if (miI == BB->end()) {
25213 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
25214 sEnd = BB->succ_end();
25215 sItr != sEnd; ++sItr) {
25216 MachineBasicBlock* succ = *sItr;
25217 if (succ->isLiveIn(X86::EFLAGS))
25222 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
25223 // out. SelectMI should have a kill flag on EFLAGS.
25224 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
25228 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
25229 // together with other CMOV pseudo-opcodes into a single basic-block with
25230 // a conditional jump around it.
25231 static bool isCMOVPseudo(MachineInstr &MI) {
25232 switch (MI.getOpcode()) {
25233 case X86::CMOV_FR32:
25234 case X86::CMOV_FR64:
25235 case X86::CMOV_GR8:
25236 case X86::CMOV_GR16:
25237 case X86::CMOV_GR32:
25238 case X86::CMOV_RFP32:
25239 case X86::CMOV_RFP64:
25240 case X86::CMOV_RFP80:
25241 case X86::CMOV_V2F64:
25242 case X86::CMOV_V2I64:
25243 case X86::CMOV_V4F32:
25244 case X86::CMOV_V4F64:
25245 case X86::CMOV_V4I64:
25246 case X86::CMOV_V16F32:
25247 case X86::CMOV_V8F32:
25248 case X86::CMOV_V8F64:
25249 case X86::CMOV_V8I64:
25250 case X86::CMOV_V8I1:
25251 case X86::CMOV_V16I1:
25252 case X86::CMOV_V32I1:
25253 case X86::CMOV_V64I1:
25261 MachineBasicBlock *
25262 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
25263 MachineBasicBlock *BB) const {
25264 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25265 DebugLoc DL = MI.getDebugLoc();
25267 // To "insert" a SELECT_CC instruction, we actually have to insert the
25268 // diamond control-flow pattern. The incoming instruction knows the
25269 // destination vreg to set, the condition code register to branch on, the
25270 // true/false values to select between, and a branch opcode to use.
25271 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25272 MachineFunction::iterator It = ++BB->getIterator();
25277 // cmpTY ccX, r1, r2
25279 // fallthrough --> copy0MBB
25280 MachineBasicBlock *thisMBB = BB;
25281 MachineFunction *F = BB->getParent();
25283 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
25284 // as described above, by inserting a BB, and then making a PHI at the join
25285 // point to select the true and false operands of the CMOV in the PHI.
25287 // The code also handles two different cases of multiple CMOV opcodes
25291 // In this case, there are multiple CMOVs in a row, all of which are based on
25292 // the same condition setting (or the exact opposite condition setting).
25293 // In this case we can lower all the CMOVs using a single inserted BB, and
25294 // then make a number of PHIs at the join point to model the CMOVs. The only
25295 // trickiness here, is that in a case like:
25297 // t2 = CMOV cond1 t1, f1
25298 // t3 = CMOV cond1 t2, f2
25300 // when rewriting this into PHIs, we have to perform some renaming on the
25301 // temps since you cannot have a PHI operand refer to a PHI result earlier
25302 // in the same block. The "simple" but wrong lowering would be:
25304 // t2 = PHI t1(BB1), f1(BB2)
25305 // t3 = PHI t2(BB1), f2(BB2)
25307 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
25308 // renaming is to note that on the path through BB1, t2 is really just a
25309 // copy of t1, and do that renaming, properly generating:
25311 // t2 = PHI t1(BB1), f1(BB2)
25312 // t3 = PHI t1(BB1), f2(BB2)
25314 // Case 2, we lower cascaded CMOVs such as
25316 // (CMOV (CMOV F, T, cc1), T, cc2)
25318 // to two successive branches. For that, we look for another CMOV as the
25319 // following instruction.
25321 // Without this, we would add a PHI between the two jumps, which ends up
25322 // creating a few copies all around. For instance, for
25324 // (sitofp (zext (fcmp une)))
25326 // we would generate:
25328 // ucomiss %xmm1, %xmm0
25329 // movss <1.0f>, %xmm0
25330 // movaps %xmm0, %xmm1
25332 // xorps %xmm1, %xmm1
25335 // movaps %xmm1, %xmm0
25339 // because this custom-inserter would have generated:
25351 // A: X = ...; Y = ...
25353 // C: Z = PHI [X, A], [Y, B]
25355 // E: PHI [X, C], [Z, D]
25357 // If we lower both CMOVs in a single step, we can instead generate:
25369 // A: X = ...; Y = ...
25371 // E: PHI [X, A], [X, C], [Y, D]
25373 // Which, in our sitofp/fcmp example, gives us something like:
25375 // ucomiss %xmm1, %xmm0
25376 // movss <1.0f>, %xmm0
25379 // xorps %xmm0, %xmm0
25383 MachineInstr *CascadedCMOV = nullptr;
25384 MachineInstr *LastCMOV = &MI;
25385 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
25386 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
25387 MachineBasicBlock::iterator NextMIIt =
25388 std::next(MachineBasicBlock::iterator(MI));
25390 // Check for case 1, where there are multiple CMOVs with the same condition
25391 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
25392 // number of jumps the most.
25394 if (isCMOVPseudo(MI)) {
25395 // See if we have a string of CMOVS with the same condition.
25396 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
25397 (NextMIIt->getOperand(3).getImm() == CC ||
25398 NextMIIt->getOperand(3).getImm() == OppCC)) {
25399 LastCMOV = &*NextMIIt;
25404 // This checks for case 2, but only do this if we didn't already find
25405 // case 1, as indicated by LastCMOV == MI.
25406 if (LastCMOV == &MI && NextMIIt != BB->end() &&
25407 NextMIIt->getOpcode() == MI.getOpcode() &&
25408 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
25409 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
25410 NextMIIt->getOperand(1).isKill()) {
25411 CascadedCMOV = &*NextMIIt;
25414 MachineBasicBlock *jcc1MBB = nullptr;
25416 // If we have a cascaded CMOV, we lower it to two successive branches to
25417 // the same block. EFLAGS is used by both, so mark it as live in the second.
25418 if (CascadedCMOV) {
25419 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
25420 F->insert(It, jcc1MBB);
25421 jcc1MBB->addLiveIn(X86::EFLAGS);
25424 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
25425 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
25426 F->insert(It, copy0MBB);
25427 F->insert(It, sinkMBB);
25429 // If the EFLAGS register isn't dead in the terminator, then claim that it's
25430 // live into the sink and copy blocks.
25431 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25433 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
25434 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
25435 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
25436 copy0MBB->addLiveIn(X86::EFLAGS);
25437 sinkMBB->addLiveIn(X86::EFLAGS);
25440 // Transfer the remainder of BB and its successor edges to sinkMBB.
25441 sinkMBB->splice(sinkMBB->begin(), BB,
25442 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
25443 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
25445 // Add the true and fallthrough blocks as its successors.
25446 if (CascadedCMOV) {
25447 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
25448 BB->addSuccessor(jcc1MBB);
25450 // In that case, jcc1MBB will itself fall through to copy0MBB, and
25451 // jump to the sinkMBB.
25452 jcc1MBB->addSuccessor(copy0MBB);
25453 jcc1MBB->addSuccessor(sinkMBB);
25455 BB->addSuccessor(copy0MBB);
25458 // The true block target of the first (or only) branch is always sinkMBB.
25459 BB->addSuccessor(sinkMBB);
25461 // Create the conditional branch instruction.
25462 unsigned Opc = X86::GetCondBranchFromCond(CC);
25463 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
25465 if (CascadedCMOV) {
25466 unsigned Opc2 = X86::GetCondBranchFromCond(
25467 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
25468 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
25472 // %FalseValue = ...
25473 // # fallthrough to sinkMBB
25474 copy0MBB->addSuccessor(sinkMBB);
25477 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
25479 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
25480 MachineBasicBlock::iterator MIItEnd =
25481 std::next(MachineBasicBlock::iterator(LastCMOV));
25482 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
25483 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
25484 MachineInstrBuilder MIB;
25486 // As we are creating the PHIs, we have to be careful if there is more than
25487 // one. Later CMOVs may reference the results of earlier CMOVs, but later
25488 // PHIs have to reference the individual true/false inputs from earlier PHIs.
25489 // That also means that PHI construction must work forward from earlier to
25490 // later, and that the code must maintain a mapping from each earlier PHI's
25491 // destination register to the registers that went into that PHI.
25493 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
25494 unsigned DestReg = MIIt->getOperand(0).getReg();
25495 unsigned Op1Reg = MIIt->getOperand(1).getReg();
25496 unsigned Op2Reg = MIIt->getOperand(2).getReg();
25498 // If this CMOV we are generating is the opposite condition from
25499 // the jump we generated, then we have to swap the operands for the
25500 // PHI that is going to be generated.
25501 if (MIIt->getOperand(3).getImm() == OppCC)
25502 std::swap(Op1Reg, Op2Reg);
25504 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
25505 Op1Reg = RegRewriteTable[Op1Reg].first;
25507 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
25508 Op2Reg = RegRewriteTable[Op2Reg].second;
25510 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
25511 TII->get(X86::PHI), DestReg)
25512 .addReg(Op1Reg).addMBB(copy0MBB)
25513 .addReg(Op2Reg).addMBB(thisMBB);
25515 // Add this PHI to the rewrite table.
25516 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
25519 // If we have a cascaded CMOV, the second Jcc provides the same incoming
25520 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
25521 if (CascadedCMOV) {
25522 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
25523 // Copy the PHI result to the register defined by the second CMOV.
25524 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
25525 DL, TII->get(TargetOpcode::COPY),
25526 CascadedCMOV->getOperand(0).getReg())
25527 .addReg(MI.getOperand(0).getReg());
25528 CascadedCMOV->eraseFromParent();
25531 // Now remove the CMOV(s).
25532 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
25533 (MIIt++)->eraseFromParent();
25538 MachineBasicBlock *
25539 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25540 MachineBasicBlock *BB) const {
25541 // Combine the following atomic floating-point modification pattern:
25542 // a.store(reg OP a.load(acquire), release)
25543 // Transform it into:
25544 // OPss (%gpr), %xmm
25545 // movss %xmm, (%gpr)
25546 // Or sd equivalent for 64-bit operations.
25548 switch (MI.getOpcode()) {
25549 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25550 case X86::RELEASE_FADD32mr:
25551 FOp = X86::ADDSSrm;
25552 MOp = X86::MOVSSmr;
25554 case X86::RELEASE_FADD64mr:
25555 FOp = X86::ADDSDrm;
25556 MOp = X86::MOVSDmr;
25559 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25560 DebugLoc DL = MI.getDebugLoc();
25561 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
25562 unsigned ValOpIdx = X86::AddrNumOperands;
25563 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
25564 MachineInstrBuilder MIB =
25565 BuildMI(*BB, MI, DL, TII->get(FOp),
25566 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25568 for (int i = 0; i < X86::AddrNumOperands; ++i) {
25569 MachineOperand &Operand = MI.getOperand(i);
25570 // Clear any kill flags on register operands as we'll create a second
25571 // instruction using the same address operands.
25572 if (Operand.isReg())
25573 Operand.setIsKill(false);
25576 MachineInstr *FOpMI = MIB;
25577 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25578 for (int i = 0; i < X86::AddrNumOperands; ++i)
25579 MIB.add(MI.getOperand(i));
25580 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25581 MI.eraseFromParent(); // The pseudo instruction is gone now.
25585 MachineBasicBlock *
25586 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25587 MachineBasicBlock *BB) const {
25588 MachineFunction *MF = BB->getParent();
25589 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25590 DebugLoc DL = MI.getDebugLoc();
25591 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25593 assert(MF->shouldSplitStack());
25595 const bool Is64Bit = Subtarget.is64Bit();
25596 const bool IsLP64 = Subtarget.isTarget64BitLP64();
25598 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25599 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
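// Per-ABI offset of the stack limit slot that the split-stack runtime keeps
// in thread-local storage (addressed through %fs on 64-bit, %gs on 32-bit);
// the compare below reads the current stacklet's limit from there.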
25602 // ... [Till the alloca]
25603 // If stacklet is not large enough, jump to mallocMBB
25606 // Allocate by subtracting from RSP
25607 // Jump to continueMBB
25610 // Allocate by call to runtime
25614 // [rest of original BB]
25617 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25618 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25619 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25621 MachineRegisterInfo &MRI = MF->getRegInfo();
25622 const TargetRegisterClass *AddrRegClass =
25623 getRegClassFor(getPointerTy(MF->getDataLayout()));
25625 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25626 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25627 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25628 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25629 sizeVReg = MI.getOperand(1).getReg(),
25631 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
25633 MachineFunction::iterator MBBIter = ++BB->getIterator();
25635 MF->insert(MBBIter, bumpMBB);
25636 MF->insert(MBBIter, mallocMBB);
25637 MF->insert(MBBIter, continueMBB);
25639 continueMBB->splice(continueMBB->begin(), BB,
25640 std::next(MachineBasicBlock::iterator(MI)), BB->end());
25641 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
25643 // Add code to the main basic block to check if the stack limit has been hit,
25644 // and if so, jump to mallocMBB otherwise to bumpMBB.
25645 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
25646 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
25647 .addReg(tmpSPVReg).addReg(sizeVReg);
25648 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
25649 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
25650 .addReg(SPLimitVReg);
25651 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
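// SPLimitVReg holds the stack pointer the allocation would produce (SP -
// size); if the stacklet limit stored at %fs/%gs:TlsOffset is greater than
// that, the current stacklet cannot satisfy the request and we take the
// mallocMBB path instead.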
25653 // bumpMBB simply decreases the stack pointer, since we know the current
25654 // stacklet has enough space.
25655 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
25656 .addReg(SPLimitVReg);
25657 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
25658 .addReg(SPLimitVReg);
25659 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25661 // Calls into a routine in libgcc to allocate more space from the heap.
25662 const uint32_t *RegMask =
25663 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
25665 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
25667 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25668 .addExternalSymbol("__morestack_allocate_stack_space")
25669 .addRegMask(RegMask)
25670 .addReg(X86::RDI, RegState::Implicit)
25671 .addReg(X86::RAX, RegState::ImplicitDefine);
25672 } else if (Is64Bit) {
25673 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
25675 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25676 .addExternalSymbol("__morestack_allocate_stack_space")
25677 .addRegMask(RegMask)
25678 .addReg(X86::EDI, RegState::Implicit)
25679 .addReg(X86::EAX, RegState::ImplicitDefine);
25681 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
25683 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
25684 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
25685 .addExternalSymbol("__morestack_allocate_stack_space")
25686 .addRegMask(RegMask)
25687 .addReg(X86::EAX, RegState::ImplicitDefine);
25691 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
25694 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
25695 .addReg(IsLP64 ? X86::RAX : X86::EAX);
25696 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25698 // Set up the CFG correctly.
25699 BB->addSuccessor(bumpMBB);
25700 BB->addSuccessor(mallocMBB);
25701 mallocMBB->addSuccessor(continueMBB);
25702 bumpMBB->addSuccessor(continueMBB);
25704 // Take care of the PHI nodes.
25705 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
25706 MI.getOperand(0).getReg())
25707 .addReg(mallocPtrVReg)
25709 .addReg(bumpSPPtrVReg)
25712 // Delete the original pseudo instruction.
25713 MI.eraseFromParent();
25716 return continueMBB;
25719 MachineBasicBlock *
25720 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
25721 MachineBasicBlock *BB) const {
25722 MachineFunction *MF = BB->getParent();
25723 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25724 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
25725 DebugLoc DL = MI.getDebugLoc();
25727 assert(!isAsynchronousEHPersonality(
25728 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
25729 "SEH does not use catchret!");
25731 // Only 32-bit EH needs to worry about manually restoring stack pointers.
25732 if (!Subtarget.is32Bit())
25735 // C++ EH creates a new target block to hold the restore code, and wires up
25736 // the new block to the return destination with a normal JMP_4.
25737 MachineBasicBlock *RestoreMBB =
25738 MF->CreateMachineBasicBlock(BB->getBasicBlock());
25739 assert(BB->succ_size() == 1);
25740 MF->insert(std::next(BB->getIterator()), RestoreMBB);
25741 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
25742 BB->addSuccessor(RestoreMBB);
25743 MI.getOperand(0).setMBB(RestoreMBB);
25745 auto RestoreMBBI = RestoreMBB->begin();
25746 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
25747 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
25751 MachineBasicBlock *
25752 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
25753 MachineBasicBlock *BB) const {
25754 MachineFunction *MF = BB->getParent();
25755 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
25756 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
25757 // Only 32-bit SEH requires special handling for catchpad.
25758 if (IsSEH && Subtarget.is32Bit()) {
25759 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25760 DebugLoc DL = MI.getDebugLoc();
25761 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
25763 MI.eraseFromParent();
25767 MachineBasicBlock *
25768 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
25769 MachineBasicBlock *BB) const {
25770 // So, here we replace TLSADDR with the sequence:
25771 // adjust_stackdown -> TLSADDR -> adjust_stackup.
25772 // We need this because TLSADDR is lowered into calls
25773 // inside MC, therefore without the two markers shrink-wrapping
25774 // may push the prologue/epilogue past them.
25775 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25776 DebugLoc DL = MI.getDebugLoc();
25777 MachineFunction &MF = *BB->getParent();
25779 // Emit CALLSEQ_START right before the instruction.
25780 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
25781 MachineInstrBuilder CallseqStart =
25782 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
25783 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
25785 // Emit CALLSEQ_END right after the instruction.
25786 // We don't call erase from parent because we want to keep the
25787 // original instruction around.
25788 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
25789 MachineInstrBuilder CallseqEnd =
25790 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
25791 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
25796 MachineBasicBlock *
25797 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
25798 MachineBasicBlock *BB) const {
25799 // This is pretty easy. We're taking the value that we received from
25800 // our load from the relocation, sticking it in either RDI (x86-64)
25801 // or EAX and doing an indirect call. The return value will then
25802 // be in the normal return register.
25803 MachineFunction *F = BB->getParent();
25804 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25805 DebugLoc DL = MI.getDebugLoc();
25807 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
25808 assert(MI.getOperand(3).isGlobal() && "This should be a global");
25810 // Get a register mask for the lowered call.
25811 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
25812 // proper register mask.
25813 const uint32_t *RegMask =
25814 Subtarget.is64Bit() ?
25815 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
25816 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
25817 if (Subtarget.is64Bit()) {
25818 MachineInstrBuilder MIB =
25819 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
25823 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25824 MI.getOperand(3).getTargetFlags())
25826 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
25827 addDirectMem(MIB, X86::RDI);
25828 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
25829 } else if (!isPositionIndependent()) {
25830 MachineInstrBuilder MIB =
25831 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25835 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25836 MI.getOperand(3).getTargetFlags())
25838 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25839 addDirectMem(MIB, X86::EAX);
25840 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
25842 MachineInstrBuilder MIB =
25843 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25844 .addReg(TII->getGlobalBaseReg(F))
25847 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25848 MI.getOperand(3).getTargetFlags())
25850 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25851 addDirectMem(MIB, X86::EAX);
25852 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
25855 MI.eraseFromParent(); // The pseudo instruction is gone now.
25859 MachineBasicBlock *
25860 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
25861 MachineBasicBlock *MBB) const {
25862 DebugLoc DL = MI.getDebugLoc();
25863 MachineFunction *MF = MBB->getParent();
25864 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25865 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25866 MachineRegisterInfo &MRI = MF->getRegInfo();
25868 const BasicBlock *BB = MBB->getBasicBlock();
25869 MachineFunction::iterator I = ++MBB->getIterator();
25871 // Memory Reference
25872 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25873 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25876 unsigned MemOpndSlot = 0;
25878 unsigned CurOp = 0;
25880 unsigned DstReg = MI.getOperand(CurOp++).getReg();
25881 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25882 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
25884 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25885 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
25887 MemOpndSlot = CurOp;
25889 MVT PVT = getPointerTy(MF->getDataLayout());
25890 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25891 "Invalid Pointer Size!");
25893 // For v = setjmp(buf), we generate
25896 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
25897 // SjLjSetup restoreMBB
25903 // v = phi(main, restore)
25906 // if the base pointer is being used, load it from the frame
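// Note on the buffer layout used here and in emitEHSjLjLongJmp: slot 1
// (LabelOffset) holds the resume address stored below; emitEHSjLjLongJmp
// reloads the frame pointer from slot 0, the resume address from slot 1 and
// the stack pointer from slot 2 (SPOffset), all in pointer-sized slots.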
25909 MachineBasicBlock *thisMBB = MBB;
25910 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25911 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25912 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
25913 MF->insert(I, mainMBB);
25914 MF->insert(I, sinkMBB);
25915 MF->push_back(restoreMBB);
25916 restoreMBB->setHasAddressTaken();
25918 MachineInstrBuilder MIB;
25920 // Transfer the remainder of BB and its successor edges to sinkMBB.
25921 sinkMBB->splice(sinkMBB->begin(), MBB,
25922 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25923 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25926 unsigned PtrStoreOpc = 0;
25927 unsigned LabelReg = 0;
25928 const int64_t LabelOffset = 1 * PVT.getStoreSize();
25929 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
25930 !isPositionIndependent();
25932 // Prepare IP either in reg or imm.
25933 if (!UseImmLabel) {
25934 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
25935 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
25936 LabelReg = MRI.createVirtualRegister(PtrRC);
25937 if (Subtarget.is64Bit()) {
25938 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
25942 .addMBB(restoreMBB)
25945 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
25946 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
25947 .addReg(XII->getGlobalBaseReg(MF))
25950 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
25954 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
25956 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
25957 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25958 if (i == X86::AddrDisp)
25959 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
25961 MIB.add(MI.getOperand(MemOpndSlot + i));
25964 MIB.addReg(LabelReg);
25966 MIB.addMBB(restoreMBB);
25967 MIB.setMemRefs(MMOBegin, MMOEnd);
25969 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
25970 .addMBB(restoreMBB);
25972 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25973 MIB.addRegMask(RegInfo->getNoPreservedMask());
25974 thisMBB->addSuccessor(mainMBB);
25975 thisMBB->addSuccessor(restoreMBB);
25979 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
25980 mainMBB->addSuccessor(sinkMBB);
25983 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
25984 TII->get(X86::PHI), DstReg)
25985 .addReg(mainDstReg).addMBB(mainMBB)
25986 .addReg(restoreDstReg).addMBB(restoreMBB);
25989 if (RegInfo->hasBasePointer(*MF)) {
25990 const bool Uses64BitFramePtr =
25991 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
25992 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
25993 X86FI->setRestoreBasePointer(MF);
25994 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
25995 unsigned BasePtr = RegInfo->getBaseRegister();
25996 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
25997 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
25998 FramePtr, true, X86FI->getRestoreBasePointerOffset())
25999 .setMIFlag(MachineInstr::FrameSetup);
26001 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
26002 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26003 restoreMBB->addSuccessor(sinkMBB);
26005 MI.eraseFromParent();
26009 MachineBasicBlock *
26010 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26011 MachineBasicBlock *MBB) const {
26012 DebugLoc DL = MI.getDebugLoc();
26013 MachineFunction *MF = MBB->getParent();
26014 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26015 MachineRegisterInfo &MRI = MF->getRegInfo();
26017 // Memory Reference
26018 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26019 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26021 MVT PVT = getPointerTy(MF->getDataLayout());
26022 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26023 "Invalid Pointer Size!");
26025 const TargetRegisterClass *RC =
26026 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26027 unsigned Tmp = MRI.createVirtualRegister(RC);
26028 // Since FP is only updated here but NOT referenced, it's treated as GPR.
26029 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26030 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26031 unsigned SP = RegInfo->getStackRegister();
26033 MachineInstrBuilder MIB;
26035 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26036 const int64_t SPOffset = 2 * PVT.getStoreSize();
26038 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
26039 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
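// The emitted sequence is roughly (64-bit case, buf = the jmp_buf operand):
//   movq  0(buf), %rbp        ; restore frame pointer
//   movq  8(buf), %tmp        ; resume address (LabelOffset)
//   movq 16(buf), %rsp        ; restore stack pointer (SPOffset)
//   jmpq *%tmp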
26042 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
26043 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
26044 MIB.add(MI.getOperand(i));
26045 MIB.setMemRefs(MMOBegin, MMOEnd);
26047 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
26048 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26049 if (i == X86::AddrDisp)
26050 MIB.addDisp(MI.getOperand(i), LabelOffset);
26052 MIB.add(MI.getOperand(i));
26054 MIB.setMemRefs(MMOBegin, MMOEnd);
26056 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
26057 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26058 if (i == X86::AddrDisp)
26059 MIB.addDisp(MI.getOperand(i), SPOffset);
26061 MIB.add(MI.getOperand(i));
26063 MIB.setMemRefs(MMOBegin, MMOEnd);
26065 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
26067 MI.eraseFromParent();
26071 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
26072 MachineBasicBlock *MBB,
26073 MachineBasicBlock *DispatchBB,
26075 DebugLoc DL = MI.getDebugLoc();
26076 MachineFunction *MF = MBB->getParent();
26077 MachineRegisterInfo *MRI = &MF->getRegInfo();
26078 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26080 MVT PVT = getPointerTy(MF->getDataLayout());
26081 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
26086 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26087 !isPositionIndependent();
26090 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26092 const TargetRegisterClass *TRC =
26093 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26094 VR = MRI->createVirtualRegister(TRC);
26095 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26097 if (Subtarget.is64Bit())
26098 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
26102 .addMBB(DispatchBB)
26105 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
26106 .addReg(0) /* TII->getGlobalBaseReg(MF) */
26109 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
26113 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
26114 addFrameReference(MIB, FI, 36);
26116 MIB.addMBB(DispatchBB);
26121 MachineBasicBlock *
26122 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
26123 MachineBasicBlock *BB) const {
26124 DebugLoc DL = MI.getDebugLoc();
26125 MachineFunction *MF = BB->getParent();
26126 MachineFrameInfo &MFI = MF->getFrameInfo();
26127 MachineRegisterInfo *MRI = &MF->getRegInfo();
26128 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26129 int FI = MFI.getFunctionContextIndex();
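// Sketch of the generated code: SetupEntryBlockForSjLj stores the address of
// DispatchBB into the function context; DispatchBB loads the call-site index
// from the context, compares it against the number of landing pads and traps
// if it is out of range; DispContBB then jumps through a jump table to the
// selected landing pad.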
26131 // Get a mapping of the call site numbers to all of the landing pads they're
26132 // associated with.
26133 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
26134 unsigned MaxCSNum = 0;
26135 for (auto &MBB : *MF) {
26136 if (!MBB.isEHPad())
26139 MCSymbol *Sym = nullptr;
26140 for (const auto &MI : MBB) {
26141 if (MI.isDebugValue())
26144 assert(MI.isEHLabel() && "expected EH_LABEL");
26145 Sym = MI.getOperand(0).getMCSymbol();
26149 if (!MF->hasCallSiteLandingPad(Sym))
26152 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
26153 CallSiteNumToLPad[CSI].push_back(&MBB);
26154 MaxCSNum = std::max(MaxCSNum, CSI);
26158 // Get an ordered list of the machine basic blocks for the jump table.
26159 std::vector<MachineBasicBlock *> LPadList;
26160 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
26161 LPadList.reserve(CallSiteNumToLPad.size());
26163 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
26164 for (auto &LP : CallSiteNumToLPad[CSI]) {
26165 LPadList.push_back(LP);
26166 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
26170 assert(!LPadList.empty() &&
26171 "No landing pad destinations for the dispatch jump table!");
26173 // Create the MBBs for the dispatch code.
26175 // Shove the dispatch's address into the return slot in the function context.
26176 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
26177 DispatchBB->setIsEHPad(true);
26179 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
26180 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
26181 DispatchBB->addSuccessor(TrapBB);
26183 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
26184 DispatchBB->addSuccessor(DispContBB);
26187 MF->push_back(DispatchBB);
26188 MF->push_back(DispContBB);
26189 MF->push_back(TrapBB);
26191 // Insert code into the entry block that creates and registers the function
// context.
26193 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
26195 // Create the jump table and associated information
26196 MachineJumpTableInfo *JTI =
26197 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
26198 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
26200 const X86RegisterInfo &RI = TII->getRegisterInfo();
26201 // Add a register mask with no preserved registers. This results in all
26202 // registers being marked as clobbered.
26203 if (RI.hasBasePointer(*MF)) {
26204 const bool FPIs64Bit =
26205 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26206 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
26207 MFI->setRestoreBasePointer(MF);
26209 unsigned FP = RI.getFrameRegister(*MF);
26210 unsigned BP = RI.getBaseRegister();
26211 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
26212 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
26213 MFI->getRestoreBasePointerOffset())
26214 .addRegMask(RI.getNoPreservedMask());
26216 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
26217 .addRegMask(RI.getNoPreservedMask());
26220 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26221 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
26223 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
26225 .addImm(LPadList.size());
26226 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
26228 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26229 BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
26232 BuildMI(DispContBB, DL,
26233 TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
26235 .addImm(Subtarget.is64Bit() ? 8 : 4)
26237 .addJumpTableIndex(MJTI)
26240 // Add the jump table entries as successors to the MBB.
26241 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
26242 for (auto &LP : LPadList)
26243 if (SeenMBBs.insert(LP).second)
26244 DispContBB->addSuccessor(LP);
26246 // N.B. the order the invoke BBs are processed in doesn't matter here.
26247 SmallVector<MachineBasicBlock *, 64> MBBLPads;
26248 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
26249 for (MachineBasicBlock *MBB : InvokeBBs) {
26250 // Remove the landing pad successor from the invoke block and replace it
26251 // with the new dispatch block.
26252 // Keep a copy of Successors since it's modified inside the loop.
26253 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
26255 // FIXME: Avoid quadratic complexity.
26256 for (auto MBBS : Successors) {
26257 if (MBBS->isEHPad()) {
26258 MBB->removeSuccessor(MBBS);
26259 MBBLPads.push_back(MBBS);
26263 MBB->addSuccessor(DispatchBB);
26265 // Find the invoke call and mark all of the callee-saved registers as
26266 // 'implicit defined' so that they're spilled. This prevents code from
26267 // moving instructions to before the EH block, where they will never be
// executed.
26269 for (auto &II : reverse(*MBB)) {
26273 DenseMap<unsigned, bool> DefRegs;
26274 for (auto &MOp : II.operands())
26276 DefRegs[MOp.getReg()] = true;
26278 MachineInstrBuilder MIB(*MF, &II);
26279 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
26280 unsigned Reg = SavedRegs[RI];
26282 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
26289 // Mark all former landing pads as non-landing pads. The dispatch is the only
26290 // landing pad now.
26291 for (auto &LP : MBBLPads)
26292 LP->setIsEHPad(false);
26294 // The instruction is gone now.
26295 MI.eraseFromParent();
26299 MachineBasicBlock *
26300 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
26301 MachineBasicBlock *BB) const {
26302 MachineFunction *MF = BB->getParent();
26303 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26304 DebugLoc DL = MI.getDebugLoc();
26306 switch (MI.getOpcode()) {
26307 default: llvm_unreachable("Unexpected instr type to insert");
26308 case X86::TAILJMPd64:
26309 case X86::TAILJMPr64:
26310 case X86::TAILJMPm64:
26311 case X86::TAILJMPr64_REX:
26312 case X86::TAILJMPm64_REX:
26313 llvm_unreachable("TAILJMP64 would not be touched here.");
26314 case X86::TCRETURNdi64:
26315 case X86::TCRETURNri64:
26316 case X86::TCRETURNmi64:
26318 case X86::TLS_addr32:
26319 case X86::TLS_addr64:
26320 case X86::TLS_base_addr32:
26321 case X86::TLS_base_addr64:
26322 return EmitLoweredTLSAddr(MI, BB);
26323 case X86::CATCHRET:
26324 return EmitLoweredCatchRet(MI, BB);
26325 case X86::CATCHPAD:
26326 return EmitLoweredCatchPad(MI, BB);
26327 case X86::SEG_ALLOCA_32:
26328 case X86::SEG_ALLOCA_64:
26329 return EmitLoweredSegAlloca(MI, BB);
26330 case X86::TLSCall_32:
26331 case X86::TLSCall_64:
26332 return EmitLoweredTLSCall(MI, BB);
26333 case X86::CMOV_FR32:
26334 case X86::CMOV_FR64:
26335 case X86::CMOV_FR128:
26336 case X86::CMOV_GR8:
26337 case X86::CMOV_GR16:
26338 case X86::CMOV_GR32:
26339 case X86::CMOV_RFP32:
26340 case X86::CMOV_RFP64:
26341 case X86::CMOV_RFP80:
26342 case X86::CMOV_V2F64:
26343 case X86::CMOV_V2I64:
26344 case X86::CMOV_V4F32:
26345 case X86::CMOV_V4F64:
26346 case X86::CMOV_V4I64:
26347 case X86::CMOV_V16F32:
26348 case X86::CMOV_V8F32:
26349 case X86::CMOV_V8F64:
26350 case X86::CMOV_V8I64:
26351 case X86::CMOV_V8I1:
26352 case X86::CMOV_V16I1:
26353 case X86::CMOV_V32I1:
26354 case X86::CMOV_V64I1:
26355 return EmitLoweredSelect(MI, BB);
26357 case X86::RDFLAGS32:
26358 case X86::RDFLAGS64: {
26360 unsigned PushF = MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
26361 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
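// The expansion is roughly:
//   pushfq        ; pushfd for the 32-bit variant
//   popq  %dst    ; popl for the 32-bit variant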
26362 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
26363 // Permit reads of the FLAGS register without it being defined.
26364 // This intrinsic exists to read external processor state in flags, such as
26365 // the trap flag, interrupt flag, and direction flag, none of which are
26366 // modeled by the backend.
26367 Push->getOperand(2).setIsUndef();
26368 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
26370 MI.eraseFromParent(); // The pseudo is gone now.
26374 case X86::WRFLAGS32:
26375 case X86::WRFLAGS64: {
26377 unsigned Push = MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
26379 unsigned PopF = MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
26380 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
26381 BuildMI(*BB, MI, DL, TII->get(PopF));
26383 MI.eraseFromParent(); // The pseudo is gone now.
26387 case X86::RELEASE_FADD32mr:
26388 case X86::RELEASE_FADD64mr:
26389 return EmitLoweredAtomicFP(MI, BB);
26391 case X86::FP32_TO_INT16_IN_MEM:
26392 case X86::FP32_TO_INT32_IN_MEM:
26393 case X86::FP32_TO_INT64_IN_MEM:
26394 case X86::FP64_TO_INT16_IN_MEM:
26395 case X86::FP64_TO_INT32_IN_MEM:
26396 case X86::FP64_TO_INT64_IN_MEM:
26397 case X86::FP80_TO_INT16_IN_MEM:
26398 case X86::FP80_TO_INT32_IN_MEM:
26399 case X86::FP80_TO_INT64_IN_MEM: {
26400 // Change the floating point control register to use "round towards zero"
26401 // mode when truncating to an integer value.
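// The emitted sequence is roughly: FNSTCW into a stack slot, load the old
// control word from the slot, store a control word with the rounding-control
// bits set to "round toward zero" and FLDCW it, write the old value back to
// the slot, perform the truncating IST_Fp* store, and finally FLDCW the
// original control word again.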
26402 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
26403 addFrameReference(BuildMI(*BB, MI, DL,
26404 TII->get(X86::FNSTCW16m)), CWFrameIdx);
26406 // Load the old value of the control word...
26408 unsigned OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
26409 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
26412 // Set the high part to be round to zero...
26413 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
26416 // Reload the modified control word now...
26417 addFrameReference(BuildMI(*BB, MI, DL,
26418 TII->get(X86::FLDCW16m)), CWFrameIdx);
26420 // Restore the memory image of the control word to its original value
26421 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
26424 // Get the X86 opcode to use.
unsigned Opc;
26426 switch (MI.getOpcode()) {
26427 default: llvm_unreachable("illegal opcode!");
26428 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
26429 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
26430 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
26431 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
26432 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
26433 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
26434 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
26435 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
26436 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
26439 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26440 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
26441 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
26443 // Reload the original control word now.
26444 addFrameReference(BuildMI(*BB, MI, DL,
26445 TII->get(X86::FLDCW16m)), CWFrameIdx);
26447 MI.eraseFromParent(); // The pseudo instruction is gone now.
26450 // String/text processing lowering.
26451 case X86::PCMPISTRM128REG:
26452 case X86::VPCMPISTRM128REG:
26453 case X86::PCMPISTRM128MEM:
26454 case X86::VPCMPISTRM128MEM:
26455 case X86::PCMPESTRM128REG:
26456 case X86::VPCMPESTRM128REG:
26457 case X86::PCMPESTRM128MEM:
26458 case X86::VPCMPESTRM128MEM:
26459 assert(Subtarget.hasSSE42() &&
26460 "Target must have SSE4.2 or AVX features enabled");
26461 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
26463 // String/text processing lowering.
26464 case X86::PCMPISTRIREG:
26465 case X86::VPCMPISTRIREG:
26466 case X86::PCMPISTRIMEM:
26467 case X86::VPCMPISTRIMEM:
26468 case X86::PCMPESTRIREG:
26469 case X86::VPCMPESTRIREG:
26470 case X86::PCMPESTRIMEM:
26471 case X86::VPCMPESTRIMEM:
26472 assert(Subtarget.hasSSE42() &&
26473 "Target must have SSE4.2 or AVX features enabled");
26474 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
26476 // Thread synchronization.
case X86::MONITOR:
26478 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
26479 case X86::MONITORX:
26480 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
case X86::CLZERO:
26484 return emitClzero(&MI, BB, Subtarget);
case X86::WRPKRU:
26488 return emitWRPKRU(MI, BB, Subtarget);
case X86::RDPKRU:
26490 return emitRDPKRU(MI, BB, Subtarget);
case X86::XBEGIN:
26493 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
26495 case X86::VASTART_SAVE_XMM_REGS:
26496 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
26498 case X86::VAARG_64:
26499 return EmitVAARG64WithCustomInserter(MI, BB);
26501 case X86::EH_SjLj_SetJmp32:
26502 case X86::EH_SjLj_SetJmp64:
26503 return emitEHSjLjSetJmp(MI, BB);
26505 case X86::EH_SjLj_LongJmp32:
26506 case X86::EH_SjLj_LongJmp64:
26507 return emitEHSjLjLongJmp(MI, BB);
26509 case X86::Int_eh_sjlj_setup_dispatch:
26510 return EmitSjLjDispatchBlock(MI, BB);
26512 case TargetOpcode::STATEPOINT:
26513 // As an implementation detail, STATEPOINT shares the STACKMAP format at
26514 // this point in the process. We diverge later.
26515 return emitPatchPoint(MI, BB);
26517 case TargetOpcode::STACKMAP:
26518 case TargetOpcode::PATCHPOINT:
26519 return emitPatchPoint(MI, BB);
26521 case TargetOpcode::PATCHABLE_EVENT_CALL:
26522 // Do nothing here; this is handled in the xray instrumentation pass.
26525 case X86::LCMPXCHG8B: {
26526 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
26527 // In addition to the four E[ABCD] registers implied by the encoding, CMPXCHG8B
26528 // requires a memory operand. If it happens that the current architecture is
26529 // i686 and the current function needs a base pointer
26530 // - which is ESI for i686 - the register allocator would not be able to
26531 // allocate registers for an address of the form X(%reg, %reg, Y)
26532 // - there would never be enough unreserved registers during regalloc
26533 // (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
26534 // We are giving the register allocator a hand by precomputing the address in
26535 // a new vreg using LEA.
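// Illustrative example only: an operand such as 8(%esi,%edi,4) is rewritten as
//   leal 8(%esi,%edi,4), %vreg
//   cmpxchg8b (%vreg)
// so the final instruction needs just one plain base register.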
26537 // If it is not i686 or there is no base pointer - nothing to do here.
26538 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
26541 // Even though this code does not necessarily need the base pointer to
26542 // be ESI, we check for that. The reason: if this assert fails, something
26543 // has changed in the compiler's base pointer handling, which most
26544 // probably has to be addressed somehow here.
26545 assert(TRI->getBaseRegister() == X86::ESI &&
26546 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26547 "base pointer in mind");
26549 MachineRegisterInfo &MRI = MF->getRegInfo();
26550 MVT SPTy = getPointerTy(MF->getDataLayout());
26551 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26552 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
26554 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26555 // Regalloc does not need any help when the memory operand of CMPXCHG8B
26556 // does not use an index register.
26557 if (AM.IndexReg == X86::NoRegister)
26560 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
26561 // four operand definitions that are E[ABCD] registers. We skip them and
26562 // then insert the LEA.
26563 MachineBasicBlock::iterator MBBI(MI);
26564 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
26565 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
26568 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
26570 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
26574 case X86::LCMPXCHG16B:
26576 case X86::LCMPXCHG8B_SAVE_EBX:
26577 case X86::LCMPXCHG16B_SAVE_RBX: {
26579 unsigned BasePtr = MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
26580 if (!BB->isLiveIn(BasePtr))
26581 BB->addLiveIn(BasePtr);
26587 //===----------------------------------------------------------------------===//
26588 // X86 Optimization Hooks
26589 //===----------------------------------------------------------------------===//
26591 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
26593 const APInt &DemandedElts,
26594 const SelectionDAG &DAG,
26595 unsigned Depth) const {
26596 unsigned BitWidth = Known.getBitWidth();
26597 unsigned Opc = Op.getOpcode();
26598 EVT VT = Op.getValueType();
26599 assert((Opc >= ISD::BUILTIN_OP_END ||
26600 Opc == ISD::INTRINSIC_WO_CHAIN ||
26601 Opc == ISD::INTRINSIC_W_CHAIN ||
26602 Opc == ISD::INTRINSIC_VOID) &&
26603 "Should use MaskedValueIsZero if you don't know whether Op"
26604 " is a target node!");
26620 // These nodes' second result is a boolean.
26621 if (Op.getResNo() == 0)
26624 case X86ISD::SETCC:
26625 Known.Zero.setBitsFrom(1);
26627 case X86ISD::MOVMSK: {
26628 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
26629 Known.Zero.setBitsFrom(NumLoBits);
26632 case X86ISD::VSHLI:
26633 case X86ISD::VSRLI: {
26634 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26635 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
26636 Known.setAllZero();
26640 DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
26641 unsigned ShAmt = ShiftImm->getZExtValue();
26642 if (Opc == X86ISD::VSHLI) {
26643 Known.Zero <<= ShAmt;
26644 Known.One <<= ShAmt;
26645 // Low bits are known zero.
26646 Known.Zero.setLowBits(ShAmt);
26648 Known.Zero.lshrInPlace(ShAmt);
26649 Known.One.lshrInPlace(ShAmt);
26650 // High bits are known zero.
26651 Known.Zero.setHighBits(ShAmt);
26656 case X86ISD::VZEXT: {
26657 SDValue N0 = Op.getOperand(0);
26658 unsigned NumElts = VT.getVectorNumElements();
26660 EVT SrcVT = N0.getValueType();
26661 unsigned InNumElts = SrcVT.getVectorNumElements();
26662 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
26663 assert(InNumElts >= NumElts && "Illegal VZEXT input");
26665 Known = KnownBits(InBitWidth);
26666 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
26667 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
26668 Known = Known.zext(BitWidth);
26669 Known.Zero.setBitsFrom(InBitWidth);
26675 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
26676 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
26677 unsigned Depth) const {
26678 unsigned VTBits = Op.getScalarValueSizeInBits();
26679 unsigned Opcode = Op.getOpcode();
26681 case X86ISD::SETCC_CARRY:
26682 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
26685 case X86ISD::VSEXT: {
26686 SDValue Src = Op.getOperand(0);
26687 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
26688 Tmp += VTBits - Src.getScalarValueSizeInBits();
26692 case X86ISD::VSRAI: {
26693 SDValue Src = Op.getOperand(0);
26694 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
26695 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
26697 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
26700 case X86ISD::PCMPGT:
26701 case X86ISD::PCMPEQ:
26703 case X86ISD::VPCOM:
26704 case X86ISD::VPCOMU:
26705 // Vector compares return zero/all-bits result values.
26713 /// Returns true (and the GlobalValue and the offset) if the node is a
26714 /// GlobalAddress + offset.
26715 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
26716 const GlobalValue* &GA,
26717 int64_t &Offset) const {
26718 if (N->getOpcode() == X86ISD::Wrapper) {
26719 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
26720 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
26721 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
26725 return TargetLowering::isGAPlusOffset(N, GA, Offset);
26728 // Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
26730 // TODO: Investigate sharing more of this with shuffle lowering.
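// For example, a v4i32 mask {0, Z, 1, Z} (Z = zeroable element) matches the
// Scale == 2 case below and is rewritten as a 32->64-bit
// ZERO_EXTEND_VECTOR_INREG / VZEXT of the low two elements.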
26731 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26732 bool AllowFloatDomain, bool AllowIntDomain,
26733 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
26734 const X86Subtarget &Subtarget,
26735 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
26736 unsigned NumMaskElts = Mask.size();
26737 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
26739 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
26740 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
26741 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
26742 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
26743 unsigned MaxScale = 64 / MaskEltSize;
26744 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
26746 unsigned NumDstElts = NumMaskElts / Scale;
26747 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
26748 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
26749 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
26752 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
26753 SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
26754 if (SrcVT != MaskVT)
26755 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
26756 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
26757 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
26758 Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
26759 : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
26765 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
26766 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
26767 isUndefOrEqual(Mask[0], 0) &&
26768 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
26769 Shuffle = X86ISD::VZEXT_MOVL;
26770 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
26774 // Check if we have SSE3 which will let us use MOVDDUP etc. The
26775 // instructions are no slower than UNPCKLPD but have the option to
26776 // fold the input operand into even an unaligned memory load.
26777 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
26778 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
26779 Shuffle = X86ISD::MOVDDUP;
26780 SrcVT = DstVT = MVT::v2f64;
26783 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26784 Shuffle = X86ISD::MOVSLDUP;
26785 SrcVT = DstVT = MVT::v4f32;
26788 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
26789 Shuffle = X86ISD::MOVSHDUP;
26790 SrcVT = DstVT = MVT::v4f32;
26795 if (MaskVT.is256BitVector() && AllowFloatDomain) {
26796 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
26797 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26798 Shuffle = X86ISD::MOVDDUP;
26799 SrcVT = DstVT = MVT::v4f64;
26802 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26803 Shuffle = X86ISD::MOVSLDUP;
26804 SrcVT = DstVT = MVT::v8f32;
26807 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
26808 Shuffle = X86ISD::MOVSHDUP;
26809 SrcVT = DstVT = MVT::v8f32;
26814 if (MaskVT.is512BitVector() && AllowFloatDomain) {
26815 assert(Subtarget.hasAVX512() &&
26816 "AVX512 required for 512-bit vector shuffles");
26817 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26818 Shuffle = X86ISD::MOVDDUP;
26819 SrcVT = DstVT = MVT::v8f64;
26822 if (isTargetShuffleEquivalent(
26823 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
26824 Shuffle = X86ISD::MOVSLDUP;
26825 SrcVT = DstVT = MVT::v16f32;
26828 if (isTargetShuffleEquivalent(
26829 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
26830 Shuffle = X86ISD::MOVSHDUP;
26831 SrcVT = DstVT = MVT::v16f32;
26836 // Attempt to match against broadcast-from-vector.
26837 if (Subtarget.hasAVX2()) {
26838 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
26839 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
26840 SrcVT = DstVT = MaskVT;
26841 Shuffle = X86ISD::VBROADCAST;
26849 // Attempt to match a combined shuffle mask against supported unary immediate
26850 // permute instructions.
26851 // TODO: Investigate sharing more of this with shuffle lowering.
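// For example, a v4f32 mask {1, 0, 3, 2} with no zeroable elements is matched
// further down as an immediate permute (VPERMILPI, or PSHUFD in the integer
// domain) with PermuteImm = 0xB1 (1 | 0<<2 | 3<<4 | 2<<6).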
26852 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26853 bool AllowFloatDomain,
26854 bool AllowIntDomain,
26855 const X86Subtarget &Subtarget,
26856 unsigned &Shuffle, MVT &ShuffleVT,
26857 unsigned &PermuteImm) {
26858 unsigned NumMaskElts = Mask.size();
26860 bool ContainsZeros = false;
26861 APInt Zeroable(NumMaskElts, false);
26862 for (unsigned i = 0; i != NumMaskElts; ++i) {
26864 if (isUndefOrZero(M))
26865 Zeroable.setBit(i);
26866 ContainsZeros |= (M == SM_SentinelZero);
26869 // Attempt to match against byte/bit shifts.
26870 // FIXME: Add 512-bit support.
26871 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26872 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
26873 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
26874 MaskVT.getScalarSizeInBits(), Mask,
26875 0, Zeroable, Subtarget);
26876 if (0 < ShiftAmt) {
26877 PermuteImm = (unsigned)ShiftAmt;
26882 // Ensure we don't contain any zero elements.
26886 assert(llvm::all_of(Mask, [&](int M) {
26887 return SM_SentinelUndef <= M && M < (int)NumMaskElts;
26888 }) && "Expected unary shuffle");
26890 unsigned InputSizeInBits = MaskVT.getSizeInBits();
26891 unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
26892 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
26894 // Handle PSHUFLW/PSHUFHW repeated patterns.
26895 if (MaskScalarSizeInBits == 16) {
26896 SmallVector<int, 4> RepeatedMask;
26897 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
26898 ArrayRef<int> LoMask(Mask.data() + 0, 4);
26899 ArrayRef<int> HiMask(Mask.data() + 4, 4);
26901 // PSHUFLW: permute lower 4 elements only.
26902 if (isUndefOrInRange(LoMask, 0, 4) &&
26903 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
26904 Shuffle = X86ISD::PSHUFLW;
26905 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26906 PermuteImm = getV4X86ShuffleImm(LoMask);
26910 // PSHUFHW: permute upper 4 elements only.
26911 if (isUndefOrInRange(HiMask, 4, 8) &&
26912 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
26913 // Offset the HiMask so that we can create the shuffle immediate.
26914 int OffsetHiMask[4];
26915 for (int i = 0; i != 4; ++i)
26916 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
26918 Shuffle = X86ISD::PSHUFHW;
26919 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26920 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
26929 // We only support permutation of 32/64 bit elements after this.
26930 if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
26933 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
26934 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
26935 if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
26938 // Pre-AVX2 we must use float shuffles on 256-bit vectors.
26939 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
26940 AllowFloatDomain = true;
26941 AllowIntDomain = false;
26944 // Check for lane crossing permutes.
26945 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
26946 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
26947 if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
26948 Shuffle = X86ISD::VPERMI;
26949 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
26950 PermuteImm = getV4X86ShuffleImm(Mask);
26953 if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
26954 SmallVector<int, 4> RepeatedMask;
26955 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
26956 Shuffle = X86ISD::VPERMI;
26957 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
26958 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
26965 // VPERMILPD can permute with a non-repeating shuffle.
26966 if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
26967 Shuffle = X86ISD::VPERMILPI;
26968 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
26970 for (int i = 0, e = Mask.size(); i != e; ++i) {
26972 if (M == SM_SentinelUndef)
26974 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
26975 PermuteImm |= (M & 1) << i;
26980 // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
26981 SmallVector<int, 4> RepeatedMask;
26982 if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
26985 // Narrow the repeated mask for 32-bit element permutes.
26986 SmallVector<int, 4> WordMask = RepeatedMask;
26987 if (MaskScalarSizeInBits == 64)
26988 scaleShuffleMask(2, RepeatedMask, WordMask);
26990 Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
26991 ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
26992 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
26993 PermuteImm = getV4X86ShuffleImm(WordMask);
26997 // Attempt to match a combined unary shuffle mask against supported binary
26998 // shuffle instructions.
26999 // TODO: Investigate sharing more of this with shuffle lowering.
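// Mask indices in [NumElts, 2*NumElts) refer to elements of V2. For example,
// the two-element mask {0, 3} selects V1[0] and V2[1] and is matched below as
// MOVSD, while {4, 1, 2, 3} on v4f32 takes element 0 from V2 and matches MOVSS.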
27000 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27001 bool AllowFloatDomain, bool AllowIntDomain,
27002 SDValue &V1, SDValue &V2, SDLoc &DL,
27004 const X86Subtarget &Subtarget,
27005 unsigned &Shuffle, MVT &ShuffleVT,
27007 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27009 if (MaskVT.is128BitVector()) {
27010 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
27012 Shuffle = X86ISD::MOVLHPS;
27013 ShuffleVT = MVT::v4f32;
27016 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
27018 Shuffle = X86ISD::MOVHLPS;
27019 ShuffleVT = MVT::v4f32;
27022 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
27023 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27025 Shuffle = X86ISD::MOVSD;
27026 ShuffleVT = MaskVT;
27029 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
27030 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27031 Shuffle = X86ISD::MOVSS;
27032 ShuffleVT = MaskVT;
27037 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
27038 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
27039 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27040 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
27041 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
27042 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
27043 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
27045 ShuffleVT = MaskVT;
27046 if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
27047 ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
27055 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27056 bool AllowFloatDomain,
27057 bool AllowIntDomain,
27058 SDValue &V1, SDValue &V2, SDLoc &DL,
27060 const X86Subtarget &Subtarget,
27061 unsigned &Shuffle, MVT &ShuffleVT,
27062 unsigned &PermuteImm) {
27063 unsigned NumMaskElts = Mask.size();
27064 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27066 // Attempt to match against PALIGNR byte rotate.
27067 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27068 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27069 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
27070 if (0 < ByteRotation) {
27071 Shuffle = X86ISD::PALIGNR;
27072 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
27073 PermuteImm = ByteRotation;
27078 // Attempt to combine to X86ISD::BLENDI.
27079 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
27080 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
27081 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
27082 uint64_t BlendMask = 0;
27083 bool ForceV1Zero = false, ForceV2Zero = false;
27084 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
27085 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
27087 if (MaskVT == MVT::v16i16) {
27088 // We can only use v16i16 PBLENDW if the lanes are repeated.
27089 SmallVector<int, 8> RepeatedMask;
27090 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
27092 assert(RepeatedMask.size() == 8 &&
27093 "Repeated mask size doesn't match!");
27095 for (int i = 0; i < 8; ++i)
27096 if (RepeatedMask[i] >= 8)
27097 PermuteImm |= 1 << i;
27098 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27099 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27100 Shuffle = X86ISD::BLENDI;
27101 ShuffleVT = MaskVT;
27105 // Determine a type compatible with X86ISD::BLENDI.
27106 ShuffleVT = MaskVT;
27107 if (Subtarget.hasAVX2()) {
27108 if (ShuffleVT == MVT::v4i64)
27109 ShuffleVT = MVT::v8i32;
27110 else if (ShuffleVT == MVT::v2i64)
27111 ShuffleVT = MVT::v4i32;
27113 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
27114 ShuffleVT = MVT::v8i16;
27115 else if (ShuffleVT == MVT::v4i64)
27116 ShuffleVT = MVT::v4f64;
27117 else if (ShuffleVT == MVT::v8i32)
27118 ShuffleVT = MVT::v8f32;
27121 if (!ShuffleVT.isFloatingPoint()) {
27122 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
27124 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
27125 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
27126 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
27129 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27130 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27131 PermuteImm = (unsigned)BlendMask;
27132 Shuffle = X86ISD::BLENDI;
27138 // Attempt to combine to INSERTPS.
27139 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
27140 MaskVT.is128BitVector()) {
27141 APInt Zeroable(4, 0);
27142 for (unsigned i = 0; i != NumMaskElts; ++i)
27144 Zeroable.setBit(i);
27146 if (Zeroable.getBoolValue() &&
27147 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
27148 Shuffle = X86ISD::INSERTPS;
27149 ShuffleVT = MVT::v4f32;
27154 // Attempt to combine to SHUFPD.
27155 if (AllowFloatDomain && EltSizeInBits == 64 &&
27156 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27157 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27158 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27159 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
27160 Shuffle = X86ISD::SHUFP;
27161 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
27166 // Attempt to combine to SHUFPS.
27167 if (AllowFloatDomain && EltSizeInBits == 32 &&
27168 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
27169 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27170 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27171 SmallVector<int, 4> RepeatedMask;
27172 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
27173 // Match each half of the repeated mask, to determine if it's just
27174 // referencing one of the vectors, is zeroable, or is entirely undef.
27175 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
27176 int M0 = RepeatedMask[Offset];
27177 int M1 = RepeatedMask[Offset + 1];
27179 if (isUndefInRange(RepeatedMask, Offset, 2)) {
27180 return DAG.getUNDEF(MaskVT);
27181 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
27182 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
27183 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
27184 return getZeroVector(MaskVT, Subtarget, DAG, DL);
27185 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
27186 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27187 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27189 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
27190 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27191 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27198 int ShufMask[4] = {-1, -1, -1, -1};
27199 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
27200 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
27205 Shuffle = X86ISD::SHUFP;
27206 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
27207 PermuteImm = getV4X86ShuffleImm(ShufMask);
27216 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
27219 /// This is the leaf of the recursive combine below. When we have found some
27220 /// chain of single-use x86 shuffle instructions and accumulated the combined
27221 /// shuffle mask represented by them, this will try to pattern match that mask
27222 /// into either a single instruction if there is a special purpose instruction
27223 /// for this operation, or into a PSHUFB instruction which is a fully general
27224 /// instruction but should only be used to replace chains over a certain depth.
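/// For example, two chained PSHUFDs with masks {2,3,0,1} and {1,0,3,2}
/// accumulate to the mask {3,2,1,0}, which this routine can re-emit as a
/// single PSHUFD with immediate 0x1B.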
27225 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27226 ArrayRef<int> BaseMask, int Depth,
27227 bool HasVariableMask, SelectionDAG &DAG,
27228 TargetLowering::DAGCombinerInfo &DCI,
27229 const X86Subtarget &Subtarget) {
27230 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27231 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27232 "Unexpected number of shuffle inputs!");
27234 // Find the inputs that enter the chain. Note that multiple uses are OK
27235 // here; we're not going to remove the operands we find.
27236 bool UnaryShuffle = (Inputs.size() == 1);
27237 SDValue V1 = peekThroughBitcasts(Inputs[0]);
27238 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27239 : peekThroughBitcasts(Inputs[1]));
27241 MVT VT1 = V1.getSimpleValueType();
27242 MVT VT2 = V2.getSimpleValueType();
27243 MVT RootVT = Root.getSimpleValueType();
27244 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27245 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27246 "Vector size mismatch");
27251 unsigned NumBaseMaskElts = BaseMask.size();
27252 if (NumBaseMaskElts == 1) {
27253 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27254 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27259 unsigned RootSizeInBits = RootVT.getSizeInBits();
27260 unsigned NumRootElts = RootVT.getVectorNumElements();
27261 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
27262 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27263 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
27265 // Don't combine if we are an AVX512/EVEX target and the mask element size
27266 // is different from the root element size - this would prevent writemasks
27267 // from being reused.
27268 // TODO - this currently prevents all lane shuffles from occurring.
27269 // TODO - check for writemasks usage instead of always preventing combining.
27270 // TODO - attempt to narrow Mask back to writemask size.
27271 bool IsEVEXShuffle =
27272 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
27273 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
27276 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27278 // Handle 128-bit lane shuffles of 256-bit vectors.
27279 // TODO - this should support binary shuffles.
27280 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27281 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27282 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27283 return false; // Nothing to do!
27284 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
27285 unsigned PermMask = 0;
27286 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27287 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
27289 Res = DAG.getBitcast(ShuffleVT, V1);
27290 DCI.AddToWorklist(Res.getNode());
27291 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27292 DAG.getUNDEF(ShuffleVT),
27293 DAG.getConstant(PermMask, DL, MVT::i8));
27294 DCI.AddToWorklist(Res.getNode());
27295 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27300 // For masks that have been widened to 128-bit elements or more,
27301 // narrow back down to 64-bit elements.
27302 SmallVector<int, 64> Mask;
27303 if (BaseMaskEltSizeInBits > 64) {
27304 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27305 int MaskScale = BaseMaskEltSizeInBits / 64;
27306 scaleShuffleMask(MaskScale, BaseMask, Mask);
27308 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
27311 unsigned NumMaskElts = Mask.size();
27312 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27314 // Determine the effective mask value type.
27315 FloatDomain &= (32 <= MaskEltSizeInBits);
27316 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27317 : MVT::getIntegerVT(MaskEltSizeInBits);
27318 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27320 // Only allow legal mask types.
27321 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
27324 // Attempt to match the mask against known shuffle patterns.
27325 MVT ShuffleSrcVT, ShuffleVT;
27326 unsigned Shuffle, PermuteImm;
27328 // Which shuffle domains are permitted?
27329 // Permit domain crossing at higher combine depths.
27330 bool AllowFloatDomain = FloatDomain || (Depth > 3);
27331 bool AllowIntDomain = !FloatDomain || (Depth > 3);
27333 if (UnaryShuffle) {
27334 // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
27335 // directly if we don't shuffle the lower element and we shuffle the upper
27336 // (zero) elements within themselves.
27337 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27338 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
27339 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27340 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27341 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27342 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
27343 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27349 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27350 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
27352 if (Depth == 1 && Root.getOpcode() == Shuffle)
27353 return false; // Nothing to do!
27354 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27355 return false; // AVX512 Writemask clash.
27356 Res = DAG.getBitcast(ShuffleSrcVT, V1);
27357 DCI.AddToWorklist(Res.getNode());
27358 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27359 DCI.AddToWorklist(Res.getNode());
27360 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27365 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27366 AllowIntDomain, Subtarget, Shuffle,
27367 ShuffleVT, PermuteImm)) {
27368 if (Depth == 1 && Root.getOpcode() == Shuffle)
27369 return false; // Nothing to do!
27370 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27371 return false; // AVX512 Writemask clash.
27372 Res = DAG.getBitcast(ShuffleVT, V1);
27373 DCI.AddToWorklist(Res.getNode());
27374 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27375 DAG.getConstant(PermuteImm, DL, MVT::i8));
27376 DCI.AddToWorklist(Res.getNode());
27377 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27383 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27384 V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
27386 if (Depth == 1 && Root.getOpcode() == Shuffle)
27387 return false; // Nothing to do!
27388 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27389 return false; // AVX512 Writemask clash.
27390 V1 = DAG.getBitcast(ShuffleVT, V1);
27391 DCI.AddToWorklist(V1.getNode());
27392 V2 = DAG.getBitcast(ShuffleVT, V2);
27393 DCI.AddToWorklist(V2.getNode());
27394 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27395 DCI.AddToWorklist(Res.getNode());
27396 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27401 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27402 AllowIntDomain, V1, V2, DL, DAG,
27403 Subtarget, Shuffle, ShuffleVT,
27405 if (Depth == 1 && Root.getOpcode() == Shuffle)
27406 return false; // Nothing to do!
27407 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27408 return false; // AVX512 Writemask clash.
27409 V1 = DAG.getBitcast(ShuffleVT, V1);
27410 DCI.AddToWorklist(V1.getNode());
27411 V2 = DAG.getBitcast(ShuffleVT, V2);
27412 DCI.AddToWorklist(V2.getNode());
27413 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27414 DAG.getConstant(PermuteImm, DL, MVT::i8));
27415 DCI.AddToWorklist(Res.getNode());
27416 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27421 // Don't try to re-form single instruction chains under any circumstances now
27422 // that we've done encoding canonicalization for them.
27426 bool MaskContainsZeros =
27427 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27429 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27430 // If we have a single input lane-crossing shuffle then lower to VPERMV.
27431 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27432 ((Subtarget.hasAVX2() &&
27433 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27434 (Subtarget.hasAVX512() &&
27435 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27436 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27437 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27438 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27439 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27440 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27441 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27442 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27443 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27444 DCI.AddToWorklist(VPermMask.getNode());
27445 Res = DAG.getBitcast(MaskVT, V1);
27446 DCI.AddToWorklist(Res.getNode());
27447 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27448 DCI.AddToWorklist(Res.getNode());
27449 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27454 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27455 // vector as the second source.
27456 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27457 ((Subtarget.hasAVX512() &&
27458 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27459 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27460 (Subtarget.hasVLX() &&
27461 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27462 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27463 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27464 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27465 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27466 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27467 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
27468 for (unsigned i = 0; i != NumMaskElts; ++i)
27469 if (Mask[i] == SM_SentinelZero)
27470 Mask[i] = NumMaskElts + i;
27472 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27473 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27474 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27475 DCI.AddToWorklist(VPermMask.getNode());
27476 Res = DAG.getBitcast(MaskVT, V1);
27477 DCI.AddToWorklist(Res.getNode());
27478 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
27479 DCI.AddToWorklist(Zero.getNode());
27480 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27481 DCI.AddToWorklist(Res.getNode());
27482 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27487 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27488 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27489 ((Subtarget.hasAVX512() &&
27490 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27491 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27492 (Subtarget.hasVLX() &&
27493 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27494 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27495 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27496 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27497 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27498 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27499 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27500 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27501 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27502 DCI.AddToWorklist(VPermMask.getNode());
27503 V1 = DAG.getBitcast(MaskVT, V1);
27504 DCI.AddToWorklist(V1.getNode());
27505 V2 = DAG.getBitcast(MaskVT, V2);
27506 DCI.AddToWorklist(V2.getNode());
27507 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27508 DCI.AddToWorklist(Res.getNode());
27509 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27516 // See if we can combine a single input shuffle with zeros to a bit-mask,
27517 // which is much simpler than any shuffle.
27518 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
27519 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27520 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
27521 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27522 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27523 APInt UndefElts(NumMaskElts, 0);
27524 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27525 for (unsigned i = 0; i != NumMaskElts; ++i) {
27527 if (M == SM_SentinelUndef) {
27528 UndefElts.setBit(i);
27531 if (M == SM_SentinelZero)
27533 EltBits[i] = AllOnes;
27535 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27536 DCI.AddToWorklist(BitMask.getNode());
27537 Res = DAG.getBitcast(MaskVT, V1);
27538 DCI.AddToWorklist(Res.getNode());
27539 unsigned AndOpcode =
27540 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
27541 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27542 DCI.AddToWorklist(Res.getNode());
27543 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27548 // If we have a single input shuffle with different shuffle patterns in the
27549 // 128-bit lanes, use a variable-mask VPERMILPS.
27550 // TODO: Combine other mask types at higher depths.
27551 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
27552 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
27553 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
27554 SmallVector<SDValue, 16> VPermIdx;
27555 for (int M : Mask) {
27557 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
27558 VPermIdx.push_back(Idx);
27560 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
27561 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
27562 DCI.AddToWorklist(VPermMask.getNode());
27563 Res = DAG.getBitcast(MaskVT, V1);
27564 DCI.AddToWorklist(Res.getNode());
27565 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27566 DCI.AddToWorklist(Res.getNode());
27567 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27572 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27573 // to VPERMIL2PD/VPERMIL2PS.
27574 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
27575 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
27576 MaskVT == MVT::v8f32)) {
27577 // VPERMIL2 Operation.
27578 // Bits[3] - Match Bit.
27579 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27580 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
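// A minimal worked example (illustrative only): for a v8f32 shuffle with
// NumEltsPerLane == 4, a mask element M == 10 picks per-lane element 2 of
// the second source, so Index = (10 % 4) + ((10 / 8) * 4) == 6. For 64-bit
// elements the index is shifted left by one so it occupies Bits[2:1], and
// SM_SentinelZero elements push the value 8 to request a zeroed element.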
27581 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27582 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27583 SmallVector<int, 8> VPerm2Idx;
27584 MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
27585 MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
27586 unsigned M2ZImm = 0;
27587 for (int M : Mask) {
27588 if (M == SM_SentinelUndef) {
27589 VPerm2Idx.push_back(-1);
27592 if (M == SM_SentinelZero) {
27594 VPerm2Idx.push_back(8);
27597 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
27598 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
27599 VPerm2Idx.push_back(Index);
27601 V1 = DAG.getBitcast(MaskVT, V1);
27602 DCI.AddToWorklist(V1.getNode());
27603 V2 = DAG.getBitcast(MaskVT, V2);
27604 DCI.AddToWorklist(V2.getNode());
27605 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
27606 DCI.AddToWorklist(VPerm2MaskOp.getNode());
27607 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27608 DAG.getConstant(M2ZImm, DL, MVT::i8));
27609 DCI.AddToWorklist(Res.getNode());
27610 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27615 // If we have 3 or more shuffle instructions or a chain involving a variable
27616 // mask, we can replace them with a single PSHUFB instruction profitably.
27617 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
27618 // instructions, but in practice PSHUFB tends to be *very* fast so we're
27619 // more aggressive.
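// Worked example (illustrative, assuming a 128-bit root with a 4-element
// mask {1, Zero, 3, Undef}): Ratio == 16 / 4 == 4, so each mask element
// expands to four consecutive byte indices and the emitted PSHUFB mask is
//   {4,5,6,7, 255,255,255,255, 12,13,14,15, u,u,u,u}
// where 255 requests a zeroed byte and 'u' denotes an undef byte.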
27620 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27621 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27622 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
27623 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
27624 SmallVector<SDValue, 16> PSHUFBMask;
27625 int NumBytes = RootVT.getSizeInBits() / 8;
27626 int Ratio = NumBytes / NumMaskElts;
27627 for (int i = 0; i < NumBytes; ++i) {
27628 int M = Mask[i / Ratio];
27629 if (M == SM_SentinelUndef) {
27630 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
27633 if (M == SM_SentinelZero) {
27634 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
27637 M = Ratio * M + i % Ratio;
27638 assert((M / 16) == (i / 16) && "Lane crossing detected");
27639 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27641 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
27642 Res = DAG.getBitcast(ByteVT, V1);
27643 DCI.AddToWorklist(Res.getNode());
27644 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
27645 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
27646 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
27647 DCI.AddToWorklist(Res.getNode());
27648 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27653 // With XOP, if we have a 128-bit binary input shuffle we can always combine
27654 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
27655 // slower than PSHUFB on targets that support both.
27656 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
27657 Subtarget.hasXOP()) {
27658 // VPPERM Mask Operation
27659 // Bits[4:0] - Byte Index (0 - 31)
27660 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
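// Illustrative example: a control byte of 17 selects byte 1 of the second
// 16-byte source (bytes 16-31 come from V2), while the constant 128 used
// below sets Bits[7:5] to 4 and therefore zeroes that destination byte.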
27661 SmallVector<SDValue, 16> VPPERMMask;
27663 int Ratio = NumBytes / NumMaskElts;
27664 for (int i = 0; i < NumBytes; ++i) {
27665 int M = Mask[i / Ratio];
27666 if (M == SM_SentinelUndef) {
27667 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
27670 if (M == SM_SentinelZero) {
27671 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
27674 M = Ratio * M + i % Ratio;
27675 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27677 MVT ByteVT = MVT::v16i8;
27678 V1 = DAG.getBitcast(ByteVT, V1);
27679 DCI.AddToWorklist(V1.getNode());
27680 V2 = DAG.getBitcast(ByteVT, V2);
27681 DCI.AddToWorklist(V2.getNode());
27682 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
27683 DCI.AddToWorklist(VPPERMMaskOp.getNode());
27684 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
27685 DCI.AddToWorklist(Res.getNode());
27686 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27691 // Failed to find any combines.
27695 // Attempt to constant fold all of the constant source ops.
27696 // Returns true if the entire shuffle is folded to a constant.
27697 // TODO: Extend this to merge multiple constant Ops and update the mask.
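// For example (an illustrative case, not a test from this file): a shuffle
// with mask {0, 5, 2, 7} whose sources are the constant vectors
// {1, 2, 3, 4} and {10, 20, 30, 40} folds to the constant {1, 20, 3, 40}.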
27698 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
27699 ArrayRef<int> Mask, SDValue Root,
27700 bool HasVariableMask, SelectionDAG &DAG,
27701 TargetLowering::DAGCombinerInfo &DCI,
27702 const X86Subtarget &Subtarget) {
27703 MVT VT = Root.getSimpleValueType();
27705 unsigned SizeInBits = VT.getSizeInBits();
27706 unsigned NumMaskElts = Mask.size();
27707 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
27708 unsigned NumOps = Ops.size();
27710 // Extract constant bits from each source op.
27711 bool OneUseConstantOp = false;
27712 SmallVector<APInt, 16> UndefEltsOps(NumOps);
27713 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
27714 for (unsigned i = 0; i != NumOps; ++i) {
27715 SDValue SrcOp = Ops[i];
27716 OneUseConstantOp |= SrcOp.hasOneUse();
27717 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
27722 // Only fold if at least one of the constants is only used once or
27723 // the combined shuffle has included a variable mask shuffle; this
27724 // is to avoid constant pool bloat.
27725 if (!OneUseConstantOp && !HasVariableMask)
27728 // Shuffle the constant bits according to the mask.
27729 APInt UndefElts(NumMaskElts, 0);
27730 APInt ZeroElts(NumMaskElts, 0);
27731 APInt ConstantElts(NumMaskElts, 0);
27732 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
27733 APInt::getNullValue(MaskSizeInBits));
27734 for (unsigned i = 0; i != NumMaskElts; ++i) {
27736 if (M == SM_SentinelUndef) {
27737 UndefElts.setBit(i);
27739 } else if (M == SM_SentinelZero) {
27740 ZeroElts.setBit(i);
27743 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
27745 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
27746 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
27748 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
27749 if (SrcUndefElts[SrcMaskIdx]) {
27750 UndefElts.setBit(i);
27754 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
27755 APInt &Bits = SrcEltBits[SrcMaskIdx];
27757 ZeroElts.setBit(i);
27761 ConstantElts.setBit(i);
27762 ConstantBitData[i] = Bits;
27764 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
27766 // Create the constant data.
27768 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
27769 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
27771 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
27773 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
27776 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
27777 DCI.AddToWorklist(CstOp.getNode());
27778 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
27782 /// \brief Fully generic combining of x86 shuffle instructions.
27784 /// This should be the last combine run over the x86 shuffle instructions. Once
27785 /// they have been fully optimized, this will recursively consider all chains
27786 /// of single-use shuffle instructions, build a generic model of the cumulative
27787 /// shuffle operation, and check for simpler instructions which implement this
27788 /// operation. We use this primarily for two purposes:
27790 /// 1) Collapse generic shuffles to specialized single instructions when
27791 /// equivalent. In most cases, this is just an encoding size win, but
27792 /// sometimes we will collapse multiple generic shuffles into a single
27793 /// special-purpose shuffle.
27794 /// 2) Look for sequences of shuffle instructions with 3 or more total
27795 /// instructions, and replace them with the slightly more expensive SSSE3
27796 /// PSHUFB instruction if available. We do this as the last combining step
27797 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
27798 /// a suitable short sequence of other instructions. The PSHUFB will either
27799 /// use a register or have to read from memory and so is slightly (but only
27800 /// slightly) more expensive than the other shuffle instructions.
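/// As a hypothetical illustration: a PSHUFD feeding a PSHUFLW feeding a
/// PSHUFHW, all single-use shuffles of the same register, is modelled as one
/// accumulated mask and, if no cheaper pattern is found, can be emitted as a
/// single PSHUFB on SSSE3-capable targets.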
27802 /// Because this is inherently a quadratic operation (for each shuffle in
27803 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
27804 /// This should never be an issue in practice as the shuffle lowering doesn't
27805 /// produce sequences of more than 8 instructions.
27807 /// FIXME: We will currently miss some cases where the redundant shuffling
27808 /// would simplify under the threshold for PSHUFB formation because of
27809 /// combine-ordering. To fix this, we should do the redundant instruction
27810 /// combining in this recursive walk.
27811 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
27812 int SrcOpIndex, SDValue Root,
27813 ArrayRef<int> RootMask,
27814 ArrayRef<const SDNode*> SrcNodes,
27815 int Depth, bool HasVariableMask,
27817 TargetLowering::DAGCombinerInfo &DCI,
27818 const X86Subtarget &Subtarget) {
27819 // Bound the depth of our recursive combine because this is ultimately
27820 // quadratic in nature.
27824 // Directly rip through bitcasts to find the underlying operand.
27825 SDValue Op = SrcOps[SrcOpIndex];
27826 Op = peekThroughOneUseBitcasts(Op);
27828 MVT VT = Op.getSimpleValueType();
27829 if (!VT.isVector())
27830 return false; // Bail if we hit a non-vector.
27832 assert(Root.getSimpleValueType().isVector() &&
27833 "Shuffles operate on vector types!");
27834 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
27835 "Can only combine shuffles of the same vector register size.");
27837 // Extract target shuffle mask and resolve sentinels and inputs.
27838 SmallVector<int, 64> OpMask;
27839 SmallVector<SDValue, 2> OpInputs;
27840 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask))
27843 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
27844 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
27845 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
27847 // Add the inputs to the Ops list, avoiding duplicates.
27848 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
27850 int InputIdx0 = -1, InputIdx1 = -1;
27851 for (int i = 0, e = Ops.size(); i < e; ++i) {
27852 SDValue BC = peekThroughBitcasts(Ops[i]);
27853 if (Input0 && BC == peekThroughBitcasts(Input0))
27855 if (Input1 && BC == peekThroughBitcasts(Input1))
27859 if (Input0 && InputIdx0 < 0) {
27860 InputIdx0 = SrcOpIndex;
27861 Ops[SrcOpIndex] = Input0;
27863 if (Input1 && InputIdx1 < 0) {
27864 InputIdx1 = Ops.size();
27865 Ops.push_back(Input1);
27868 assert(((RootMask.size() > OpMask.size() &&
27869 RootMask.size() % OpMask.size() == 0) ||
27870 (OpMask.size() > RootMask.size() &&
27871 OpMask.size() % RootMask.size() == 0) ||
27872 OpMask.size() == RootMask.size()) &&
27873 "The smaller number of elements must divide the larger.");
27874 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
27875 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
27876 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
27877 assert(((RootRatio == 1 && OpRatio == 1) ||
27878 (RootRatio == 1) != (OpRatio == 1)) &&
27879 "Must not have a ratio for both incoming and op masks!");
27881 SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
27883 // Merge this shuffle operation's mask into our accumulated mask. Note that
27884 // this shuffle's mask will be the first applied to the input, followed by the
27885 // root mask to get us all the way to the root value arrangement. The reason
27886 // for this order is that we are recursing up the operation chain.
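// A small hypothetical example of the composition below: with
// RootMask = {1, 0} and OpMask = {2, 3, 0, 1} (so MaskWidth == 4,
// RootRatio == 2 and OpRatio == 1), element i of the merged mask becomes
// OpMask[RootMask[i / 2] * 2 + i % 2], i.e. {0, 1, 2, 3}: the two
// half-swaps cancel out (ignoring the per-input index adjustment).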
27887 for (int i = 0; i < MaskWidth; ++i) {
27888 int RootIdx = i / RootRatio;
27889 if (RootMask[RootIdx] < 0) {
27890 // This is a zero or undef lane, we're done.
27891 Mask[i] = RootMask[RootIdx];
27895 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
27897 // Just insert the scaled root mask value if it references an input other
27898 // than the SrcOp we're currently inserting.
27899 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
27900 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
27901 Mask[i] = RootMaskedIdx;
27905 RootMaskedIdx %= MaskWidth;
27907 int OpIdx = RootMaskedIdx / OpRatio;
27908 if (OpMask[OpIdx] < 0) {
27909 // The incoming lanes are zero or undef; it doesn't matter which ones we
27911 Mask[i] = OpMask[OpIdx];
27915 // OK, we have non-zero lanes; map them through to one of the Op's inputs.
27916 int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
27917 OpMaskedIdx %= MaskWidth;
27919 if (OpMask[OpIdx] < (int)OpMask.size()) {
27920 assert(0 <= InputIdx0 && "Unknown target shuffle input");
27921 OpMaskedIdx += InputIdx0 * MaskWidth;
27923 assert(0 <= InputIdx1 && "Unknown target shuffle input");
27924 OpMaskedIdx += InputIdx1 * MaskWidth;
27927 Mask[i] = OpMaskedIdx;
27930 // Handle the all undef/zero cases early.
27931 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
27932 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
27935 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
27936 // TODO - should we handle the mixed zero/undef case as well? Just returning
27937 // a zero mask will lose information on undef elements, possibly reducing
27938 // future combine possibilities.
27939 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
27940 Subtarget, DAG, SDLoc(Root)));
27944 // Remove unused shuffle source ops.
27945 resolveTargetShuffleInputsAndMask(Ops, Mask);
27946 assert(!Ops.empty() && "Shuffle with no inputs detected");
27948 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
27950 // Update the list of shuffle nodes that have been combined so far.
27951 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
27953 CombinedNodes.push_back(Op.getNode());
27955 // See if we can recurse into each shuffle source op (if it's a target
27956 // shuffle). The source op should only be combined if it either has a
27957 // single use (i.e. current Op) or all its users have already been combined.
27958 for (int i = 0, e = Ops.size(); i < e; ++i)
27959 if (Ops[i].getNode()->hasOneUse() ||
27960 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
27961 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
27962 Depth + 1, HasVariableMask, DAG, DCI,
27966 // Attempt to constant fold all of the constant source ops.
27967 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
27971 // We can only combine unary and binary shuffle mask cases.
27972 if (Ops.size() > 2)
27975 // Minor canonicalization of the accumulated shuffle mask to make it easier
27976 // to match below. All this does is detect masks with sequential pairs of
27977 // elements, and shrink them to the half-width mask. It does this in a loop
27978 // so it will reduce the size of the mask to the minimal width mask which
27979 // performs an equivalent shuffle.
27980 SmallVector<int, 64> WidenedMask;
27981 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
27982 Mask = std::move(WidenedMask);
27985 // Canonicalization of binary shuffle masks to improve pattern matching by
27986 // commuting the inputs.
27987 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
27988 ShuffleVectorSDNode::commuteMask(Mask);
27989 std::swap(Ops[0], Ops[1]);
27992 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
27996 /// \brief Get the PSHUF-style mask from PSHUF node.
27998 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
27999 /// PSHUF-style masks that can be reused with such instructions.
28000 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
28001 MVT VT = N.getSimpleValueType();
28002 SmallVector<int, 4> Mask;
28003 SmallVector<SDValue, 2> Ops;
28006 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
28010 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
28011 // matter. Check that the upper masks are repeats and remove them.
28012 if (VT.getSizeInBits() > 128) {
28013 int LaneElts = 128 / VT.getScalarSizeInBits();
28015 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
28016 for (int j = 0; j < LaneElts; ++j)
28017 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
28018 "Mask doesn't repeat in high 128-bit lanes!");
28020 Mask.resize(LaneElts);
28023 switch (N.getOpcode()) {
28024 case X86ISD::PSHUFD:
28026 case X86ISD::PSHUFLW:
28029 case X86ISD::PSHUFHW:
28030 Mask.erase(Mask.begin(), Mask.begin() + 4);
28031 for (int &M : Mask)
28035 llvm_unreachable("No valid shuffle instruction found!");
28039 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
28041 /// We walk up the chain and look for a combinable shuffle, skipping over
28042 /// shuffles that we could hoist this shuffle's transformation past without
28043 /// altering anything.
28045 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
28046 SelectionDAG &DAG) {
28047 assert(N.getOpcode() == X86ISD::PSHUFD &&
28048 "Called with something other than an x86 128-bit half shuffle!");
28051 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
28052 // of the shuffles in the chain so that we can form a fresh chain to replace
28054 SmallVector<SDValue, 8> Chain;
28055 SDValue V = N.getOperand(0);
28056 for (; V.hasOneUse(); V = V.getOperand(0)) {
28057 switch (V.getOpcode()) {
28059 return SDValue(); // Nothing combined!
28062 // Skip bitcasts as we always know the type for the target specific
28066 case X86ISD::PSHUFD:
28067 // Found another dword shuffle.
28070 case X86ISD::PSHUFLW:
28071 // Check that the low words (being shuffled) are the identity in the
28072 // dword shuffle, and the high words are self-contained.
28073 if (Mask[0] != 0 || Mask[1] != 1 ||
28074 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
28077 Chain.push_back(V);
28080 case X86ISD::PSHUFHW:
28081 // Check that the high words (being shuffled) are the identity in the
28082 // dword shuffle, and the low words are self-contained.
28083 if (Mask[2] != 2 || Mask[3] != 3 ||
28084 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
28087 Chain.push_back(V);
28090 case X86ISD::UNPCKL:
28091 case X86ISD::UNPCKH:
28092 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
28093 // shuffle into a preceding word shuffle.
28094 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
28095 V.getSimpleValueType().getVectorElementType() != MVT::i16)
28098 // Search for a half-shuffle which we can combine with.
28099 unsigned CombineOp =
28100 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
28101 if (V.getOperand(0) != V.getOperand(1) ||
28102 !V->isOnlyUserOf(V.getOperand(0).getNode()))
28104 Chain.push_back(V);
28105 V = V.getOperand(0);
28107 switch (V.getOpcode()) {
28109 return SDValue(); // Nothing to combine.
28111 case X86ISD::PSHUFLW:
28112 case X86ISD::PSHUFHW:
28113 if (V.getOpcode() == CombineOp)
28116 Chain.push_back(V);
28120 V = V.getOperand(0);
28124 } while (V.hasOneUse());
28127 // Break out of the loop if we break out of the switch.
28131 if (!V.hasOneUse())
28132 // We fell out of the loop without finding a viable combining instruction.
28135 // Merge this node's mask and our incoming mask.
28136 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28137 for (int &M : Mask)
28139 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
28140 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28142 // Rebuild the chain around this new shuffle.
28143 while (!Chain.empty()) {
28144 SDValue W = Chain.pop_back_val();
28146 if (V.getValueType() != W.getOperand(0).getValueType())
28147 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
28149 switch (W.getOpcode()) {
28151 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
28153 case X86ISD::UNPCKL:
28154 case X86ISD::UNPCKH:
28155 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
28158 case X86ISD::PSHUFD:
28159 case X86ISD::PSHUFLW:
28160 case X86ISD::PSHUFHW:
28161 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
28165 if (V.getValueType() != N.getValueType())
28166 V = DAG.getBitcast(N.getValueType(), V);
28168 // Return the new chain to replace N.
28172 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
28175 /// We walk up the chain, skipping shuffles of the other half and looking
28176 /// through shuffles which switch halves trying to find a shuffle of the same
28177 /// pair of dwords.
28178 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
28180 TargetLowering::DAGCombinerInfo &DCI) {
28182 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
28183 "Called with something other than an x86 128-bit half shuffle!");
28185 unsigned CombineOpcode = N.getOpcode();
28187 // Walk up a single-use chain looking for a combinable shuffle.
28188 SDValue V = N.getOperand(0);
28189 for (; V.hasOneUse(); V = V.getOperand(0)) {
28190 switch (V.getOpcode()) {
28192 return false; // Nothing combined!
28195 // Skip bitcasts as we always know the type for the target specific
28199 case X86ISD::PSHUFLW:
28200 case X86ISD::PSHUFHW:
28201 if (V.getOpcode() == CombineOpcode)
28204 // Other-half shuffles are no-ops.
28207 // Break out of the loop if we break out of the switch.
28211 if (!V.hasOneUse())
28212 // We fell out of the loop without finding a viable combining instruction.
28215 // Combine away the bottom node as its shuffle will be accumulated into
28216 // a preceding shuffle.
28217 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28219 // Record the old value.
28222 // Merge this node's mask and our incoming mask (adjusted to account for all
28223 // the pshufd instructions encountered).
28224 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28225 for (int &M : Mask)
28227 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
28228 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28230 // Check that the shuffles didn't cancel each other out. If not, we need to
28231 // combine to the new one.
28233 // Replace the combinable shuffle with the combined one, updating all users
28234 // so that we re-evaluate the chain here.
28235 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
28240 /// \brief Try to combine x86 target specific shuffles.
28241 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
28242 TargetLowering::DAGCombinerInfo &DCI,
28243 const X86Subtarget &Subtarget) {
28245 MVT VT = N.getSimpleValueType();
28246 SmallVector<int, 4> Mask;
28248 unsigned Opcode = N.getOpcode();
28250 case X86ISD::PSHUFD:
28251 case X86ISD::PSHUFLW:
28252 case X86ISD::PSHUFHW:
28253 Mask = getPSHUFShuffleMask(N);
28254 assert(Mask.size() == 4);
28256 case X86ISD::UNPCKL: {
28257 auto Op0 = N.getOperand(0);
28258 auto Op1 = N.getOperand(1);
28259 unsigned Opcode0 = Op0.getOpcode();
28260 unsigned Opcode1 = Op1.getOpcode();
28262 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
28263 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
28264 // TODO: Add other horizontal operations as required.
28265 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
28266 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
28268 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
28269 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
28270 // moves upper half elements into the lower half part. For example:
28272 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
28274 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
28276 // will be combined to:
28278 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
28280 // This is only done for 128-bit vectors. From SSE4.1 onward this combine may
28281 // not fire because more capable shuffle instructions are used instead.
28282 if (!VT.is128BitVector())
28285 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
28286 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
28288 unsigned NumElts = VT.getVectorNumElements();
28289 SmallVector<int, 8> ExpectedMask(NumElts, -1);
28290 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
28293 auto ShufOp = Op1.getOperand(0);
28294 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
28295 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
28299 case X86ISD::BLENDI: {
28300 SDValue V0 = N->getOperand(0);
28301 SDValue V1 = N->getOperand(1);
28302 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
28303 "Unexpected input vector types");
28305 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
28306 // operands and changing the mask to 1. This saves us a bunch of
28307 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
28308 // x86InstrInfo knows how to commute this back after instruction selection
28309 // if it would help register allocation.
28311 // TODO: If optimizing for size or a processor that doesn't suffer from
28312 // partial register update stalls, this should be transformed into a MOVSD
28313 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
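// For example (illustrative): blendi V0, V1, 0b10 takes element 0 from V0
// and element 1 from V1; after swapping the operands, blendi V1, V0, 0b01
// selects exactly the same elements.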
28315 if (VT == MVT::v2f64)
28316 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
28317 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
28318 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
28319 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
28324 case X86ISD::MOVSD:
28325 case X86ISD::MOVSS: {
28326 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
28327 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
28328 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
28329 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
28330 if (isZero0 && isZero1)
28333 // We often lower to MOVSD/MOVSS from integer as well as native float
28334 // types; remove unnecessary domain-crossing bitcasts if we can to make it
28335 // easier to combine shuffles later on. We've already accounted for the
28336 // domain switching cost when we decided to lower with it.
28337 bool isFloat = VT.isFloatingPoint();
28338 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
28339 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
28340 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
28341 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
28342 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
28343 V0 = DAG.getBitcast(NewVT, V0);
28344 V1 = DAG.getBitcast(NewVT, V1);
28345 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
28350 case X86ISD::INSERTPS: {
28351 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
28352 SDValue Op0 = N.getOperand(0);
28353 SDValue Op1 = N.getOperand(1);
28354 SDValue Op2 = N.getOperand(2);
28355 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
28356 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
28357 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
28358 unsigned ZeroMask = InsertPSMask & 0xF;
28360 // If we zero out all elements from Op0 then we don't need to reference it.
28361 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
28362 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
28363 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28365 // If we zero out the element from Op1 then we don't need to reference it.
28366 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
28367 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28368 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28370 // Attempt to merge insertps Op1 with an inner target shuffle node.
28371 SmallVector<int, 8> TargetMask1;
28372 SmallVector<SDValue, 2> Ops1;
28373 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
28374 int M = TargetMask1[SrcIdx];
28375 if (isUndefOrZero(M)) {
28376 // Zero/UNDEF insertion - zero out element and remove dependency.
28377 InsertPSMask |= (1u << DstIdx);
28378 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28379 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28381 // Update insertps mask srcidx and reference the source input directly.
28382 assert(0 <= M && M < 8 && "Shuffle index out of range");
28383 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
28384 Op1 = Ops1[M < 4 ? 0 : 1];
28385 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28386 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28389 // Attempt to merge insertps Op0 with an inner target shuffle node.
28390 SmallVector<int, 8> TargetMask0;
28391 SmallVector<SDValue, 2> Ops0;
28392 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
28395 bool Updated = false;
28396 bool UseInput00 = false;
28397 bool UseInput01 = false;
28398 for (int i = 0; i != 4; ++i) {
28399 int M = TargetMask0[i];
28400 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
28401 // No change if element is already zero or the inserted element.
28403 } else if (isUndefOrZero(M)) {
28404 // If the target mask is undef/zero then we must zero the element.
28405 InsertPSMask |= (1u << i);
28410 // The input vector element must stay in place (index i in either source).
28411 if (M != i && M != (i + 4))
28414 // Determine which inputs of the target shuffle we're using.
28415 UseInput00 |= (0 <= M && M < 4);
28416 UseInput01 |= (4 <= M);
28419 // If we're not using both inputs of the target shuffle then use the
28420 // referenced input directly.
28421 if (UseInput00 && !UseInput01) {
28424 } else if (!UseInput00 && UseInput01) {
28430 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28431 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28439 // Nuke no-op shuffles that show up after combining.
28440 if (isNoopShuffleMask(Mask))
28441 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28443 // Look for simplifications involving one or two shuffle instructions.
28444 SDValue V = N.getOperand(0);
28445 switch (N.getOpcode()) {
28448 case X86ISD::PSHUFLW:
28449 case X86ISD::PSHUFHW:
28450 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
28452 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
28453 return SDValue(); // We combined away this shuffle, so we're done.
28455 // See if this reduces to a PSHUFD which is no more expensive and can
28456 // combine with more operations. Note that it has to at least flip the
28457 // dwords as otherwise it would have been removed as a no-op.
28458 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
28459 int DMask[] = {0, 1, 2, 3};
28460 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
28461 DMask[DOffset + 0] = DOffset + 1;
28462 DMask[DOffset + 1] = DOffset + 0;
28463 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
28464 V = DAG.getBitcast(DVT, V);
28465 DCI.AddToWorklist(V.getNode());
28466 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
28467 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
28468 DCI.AddToWorklist(V.getNode());
28469 return DAG.getBitcast(VT, V);
28472 // Look for shuffle patterns which can be implemented as a single unpack.
28473 // FIXME: This doesn't handle the location of the PSHUFD generically, and
28474 // only works when we have a PSHUFD followed by two half-shuffles.
28475 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
28476 (V.getOpcode() == X86ISD::PSHUFLW ||
28477 V.getOpcode() == X86ISD::PSHUFHW) &&
28478 V.getOpcode() != N.getOpcode() &&
28480 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
28481 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
28482 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28483 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
28484 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28485 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28487 for (int i = 0; i < 4; ++i) {
28488 WordMask[i + NOffset] = Mask[i] + NOffset;
28489 WordMask[i + VOffset] = VMask[i] + VOffset;
28491 // Map the word mask through the DWord mask.
28493 for (int i = 0; i < 8; ++i)
28494 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
28495 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
28496 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
28497 // We can replace all three shuffles with an unpack.
28498 V = DAG.getBitcast(VT, D.getOperand(0));
28499 DCI.AddToWorklist(V.getNode());
28500 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
28509 case X86ISD::PSHUFD:
28510 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
28519 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB
28520 /// operation. If true is returned then the operands of ADDSUB operation
28521 /// are written to the parameters \p Opnd0 and \p Opnd1.
28523 /// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
28524 /// so it is easier to generically match. We also insert dummy vector shuffle
28525 /// nodes for the operands which explicitly discard the lanes which are unused
28526 /// by this operation, so that the fact that they're unused can flow through
28527 /// the rest of the combiner.
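/// A minimal IR-level sketch of the recognized pattern (names hypothetical):
///   %sub = fsub <4 x float> %A, %B
///   %add = fadd <4 x float> %A, %B
///   %res = shufflevector <4 x float> %sub, <4 x float> %add,
///                        <4 x i32> <i32 0, i32 5, i32 2, i32 7>
/// i.e. even lanes come from the FSUB and odd lanes from the FADD, matching
/// what ADDSUBPS computes.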
28528 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
28529 SDValue &Opnd0, SDValue &Opnd1) {
28531 EVT VT = N->getValueType(0);
28532 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
28533 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
28534 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
28537 // We only handle target-independent shuffles.
28538 // FIXME: It would be easy and harmless to use the target shuffle mask
28539 // extraction tool to support more.
28540 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
28543 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
28544 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
28546 SDValue V1 = N->getOperand(0);
28547 SDValue V2 = N->getOperand(1);
28549 // We require the first shuffle operand to be the FSUB node, and the second to
28550 // be the FADD node.
28551 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
28552 ShuffleVectorSDNode::commuteMask(Mask);
28554 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
28557 // If there are other uses of these operations we can't fold them.
28558 if (!V1->hasOneUse() || !V2->hasOneUse())
28561 // Ensure that both operations have the same operands. Note that we can
28562 // commute the FADD operands.
28563 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
28564 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
28565 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
28568 // We're looking for blends between FADD and FSUB nodes. We insist on these
28569 // nodes being lined up in a specific expected pattern.
28570 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
28571 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
28572 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
28573 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
28574 8, 25, 10, 27, 12, 29, 14, 31})))
28582 /// \brief Try to combine a shuffle into a target-specific add-sub or
28583 /// mul-add-sub node.
28584 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
28585 const X86Subtarget &Subtarget,
28586 SelectionDAG &DAG) {
28587 SDValue Opnd0, Opnd1;
28588 if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
28591 EVT VT = N->getValueType(0);
28594 // Try to generate X86ISD::FMADDSUB node here.
28596 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
28597 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
28599 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
28600 // the ADDSUB idiom has been successfully recognized. There are no known
28601 // X86 targets with 512-bit ADDSUB instructions!
28602 if (VT.is512BitVector())
28605 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
28608 // We are looking for a shuffle where both sources are concatenated with undef
28609 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
28610 // if we can express this as a single-source shuffle, that's preferable.
28611 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
28612 const X86Subtarget &Subtarget) {
28613 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
28616 EVT VT = N->getValueType(0);
28618 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
28619 if (!VT.is128BitVector() && !VT.is256BitVector())
28622 if (VT.getVectorElementType() != MVT::i32 &&
28623 VT.getVectorElementType() != MVT::i64 &&
28624 VT.getVectorElementType() != MVT::f32 &&
28625 VT.getVectorElementType() != MVT::f64)
28628 SDValue N0 = N->getOperand(0);
28629 SDValue N1 = N->getOperand(1);
28631 // Check that both sources are concats with undef.
28632 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
28633 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
28634 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
28635 !N1.getOperand(1).isUndef())
28638 // Construct the new shuffle mask. Elements from the first source retain their
28639 // index, but elements from the second source no longer need to skip an undef.
28640 SmallVector<int, 8> Mask;
28641 int NumElts = VT.getVectorNumElements();
28643 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28644 for (int Elt : SVOp->getMask())
28645 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
28648 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
28650 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
28653 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
28654 TargetLowering::DAGCombinerInfo &DCI,
28655 const X86Subtarget &Subtarget) {
28657 EVT VT = N->getValueType(0);
28658 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28659 // If we have legalized the vector types, look for blends of FADD and FSUB
28660 // nodes that we can fuse into an ADDSUB node.
28661 if (TLI.isTypeLegal(VT))
28662 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
28665 // During Type Legalization, when promoting illegal vector types,
28666 // the backend might introduce new shuffle dag nodes and bitcasts.
28668 // This code performs the following transformation:
28669 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
28670 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
28672 // We do this only if both the bitcast and the BINOP dag nodes have
28673 // one use. Also, perform this transformation only if the new binary
28674 // operation is legal. This is to avoid introducing dag nodes that
28675 // potentially need to be further expanded (or custom lowered) into a
28676 // less optimal sequence of dag nodes.
28677 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
28678 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
28679 N->getOperand(0).getOpcode() == ISD::BITCAST &&
28680 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
28681 SDValue N0 = N->getOperand(0);
28682 SDValue N1 = N->getOperand(1);
28684 SDValue BC0 = N0.getOperand(0);
28685 EVT SVT = BC0.getValueType();
28686 unsigned Opcode = BC0.getOpcode();
28687 unsigned NumElts = VT.getVectorNumElements();
28689 if (BC0.hasOneUse() && SVT.isVector() &&
28690 SVT.getVectorNumElements() * 2 == NumElts &&
28691 TLI.isOperationLegal(Opcode, VT)) {
28692 bool CanFold = false;
28698 // isOperationLegal lies for integer ops on floating point types.
28699 CanFold = VT.isInteger();
28704 // isOperationLegal lies for floating point ops on integer types.
28705 CanFold = VT.isFloatingPoint();
28709 unsigned SVTNumElts = SVT.getVectorNumElements();
28710 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28711 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
28712 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
28713 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
28714 CanFold = SVOp->getMaskElt(i) < 0;
28717 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
28718 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
28719 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
28720 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
28725 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
28726 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
28727 // consecutive, non-overlapping, and in the right order.
28728 SmallVector<SDValue, 16> Elts;
28729 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
28730 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
28731 Elts.push_back(Elt);
28738 if (Elts.size() == VT.getVectorNumElements())
28739 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
28742 // For AVX2, we sometimes want to combine
28743 // (vector_shuffle <mask> (concat_vectors t1, undef)
28744 // (concat_vectors t2, undef))
28746 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
28747 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
28748 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
28751 if (isTargetShuffle(N->getOpcode())) {
28753 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
28756 // Try recursively combining arbitrary sequences of x86 shuffle
28757 // instructions into higher-order shuffles. We do this after combining
28758 // specific PSHUF instruction sequences into their minimal form so that we
28759 // can evaluate how many specialized shuffle instructions are involved in
28760 // a particular chain.
28761 SmallVector<int, 1> NonceMask; // Just a placeholder.
28762 NonceMask.push_back(0);
28763 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
28764 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
28766 return SDValue(); // This routine will use CombineTo to replace N.
28772 /// Check if a vector extract from a target-specific shuffle of a load can be
28773 /// folded into a single element load.
28774 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
28775 /// shuffles have been custom lowered so we need to handle those here.
28776 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
28777 TargetLowering::DAGCombinerInfo &DCI) {
28778 if (DCI.isBeforeLegalizeOps())
28781 SDValue InVec = N->getOperand(0);
28782 SDValue EltNo = N->getOperand(1);
28783 EVT EltVT = N->getValueType(0);
28785 if (!isa<ConstantSDNode>(EltNo))
28788 EVT OriginalVT = InVec.getValueType();
28790 // Peek through bitcasts; don't duplicate a load with other uses.
28791 InVec = peekThroughOneUseBitcasts(InVec);
28793 EVT CurrentVT = InVec.getValueType();
28794 if (!CurrentVT.isVector() ||
28795 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
28798 if (!isTargetShuffle(InVec.getOpcode()))
28801 // Don't duplicate a load with other uses.
28802 if (!InVec.hasOneUse())
28805 SmallVector<int, 16> ShuffleMask;
28806 SmallVector<SDValue, 2> ShuffleOps;
28808 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
28809 ShuffleOps, ShuffleMask, UnaryShuffle))
28812 // Select the input vector, guarding against an out-of-range extract index.
28813 unsigned NumElems = CurrentVT.getVectorNumElements();
28814 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
28815 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
28817 if (Idx == SM_SentinelZero)
28818 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
28819 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
28820 if (Idx == SM_SentinelUndef)
28821 return DAG.getUNDEF(EltVT);
28823 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
28824 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
28827 // If inputs to shuffle are the same for both ops, then allow 2 uses
28828 unsigned AllowedUses =
28829 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
28831 if (LdNode.getOpcode() == ISD::BITCAST) {
28832 // Don't duplicate a load with other uses.
28833 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
28836 AllowedUses = 1; // only allow 1 load use if we have a bitcast
28837 LdNode = LdNode.getOperand(0);
28840 if (!ISD::isNormalLoad(LdNode.getNode()))
28843 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
28845 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
28848 // If there's a bitcast before the shuffle, check if the load type and
28849 // alignment are valid.
28850 unsigned Align = LN0->getAlignment();
28851 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28852 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
28853 EltVT.getTypeForEVT(*DAG.getContext()));
28855 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
28858 // All checks match, so transform back to vector_shuffle so that the DAG
28859 // combiner can finish the job.
28862 // Create a shuffle node, taking into account the case that it's a unary shuffle.
28863 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
28864 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
28866 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
28867 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
28871 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
28872 const X86Subtarget &Subtarget) {
28873 SDValue N0 = N->getOperand(0);
28874 EVT VT = N->getValueType(0);
28875 EVT SrcVT = N0.getValueType();
28877 // Since MMX types are special and don't usually play with other vector types,
28878 // it's better to handle them early to be sure we emit efficient code by
28879 // avoiding store-load conversions.
28881 // Detect bitcasts from i32 to the x86mmx low word.
28882 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
28883 SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
28884 SDValue N00 = N0->getOperand(0);
28885 if (N00.getValueType() == MVT::i32)
28886 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
28889 // Detect bitcasts from an element or subvector extraction to x86mmx.
28890 if (VT == MVT::x86mmx &&
28891 (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
28892 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
28893 isNullConstant(N0.getOperand(1))) {
28894 SDValue N00 = N0->getOperand(0);
28895 if (N00.getValueType().is128BitVector())
28896 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
28897 DAG.getBitcast(MVT::v2i64, N00));
28900 // Detect bitcasts from FP_TO_SINT to x86mmx.
28901 if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
28902 N0.getOpcode() == ISD::FP_TO_SINT) {
28904 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
28905 DAG.getUNDEF(MVT::v2i32));
28906 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
28907 DAG.getBitcast(MVT::v2i64, Res));
28910 // Convert a bitcasted integer logic operation that has one bitcasted
28911 // floating-point operand into a floating-point logic operation. This may
28912 // create a load of a constant, but that is cheaper than materializing the
28913 // constant in an integer register and transferring it to an SSE register or
28914 // transferring the SSE operand to integer register and back.
28916 switch (N0.getOpcode()) {
28917 case ISD::AND: FPOpcode = X86ISD::FAND; break;
28918 case ISD::OR: FPOpcode = X86ISD::FOR; break;
28919 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
28920 default: return SDValue();
28923 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
28924 (Subtarget.hasSSE2() && VT == MVT::f64)))
28927 SDValue LogicOp0 = N0.getOperand(0);
28928 SDValue LogicOp1 = N0.getOperand(1);
28931 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
28932 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
28933 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
28934 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
28935 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
28936 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
28938 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
28939 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
28940 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
28941 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
28942 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
28943 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
28949 // Match a binop + shuffle pyramid that represents a horizontal reduction over
28950 // the elements of a vector.
28951 // Returns the vector that is being reduced on, or SDValue() if a reduction
28952 // was not matched.
28953 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
28954 // The pattern must end in an extract from index 0.
28955 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
28956 !isNullConstant(Extract->getOperand(1)))
28960 Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
28962 SDValue Op = Extract->getOperand(0);
28963 // At each stage, we're looking for something that looks like:
28964 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
28965 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
28966 // i32 undef, i32 undef, i32 undef, i32 undef>
28967 // %a = binop <8 x i32> %op, %s
28968 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
28969 // we expect something like:
28970 // <4,5,6,7,u,u,u,u>
28971 // <2,3,u,u,u,u,u,u>
28972 // <1,u,u,u,u,u,u,u>
28973 for (unsigned i = 0; i < Stages; ++i) {
28974 if (Op.getOpcode() != BinOp)
28977 ShuffleVectorSDNode *Shuffle =
28978 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
28980 Op = Op.getOperand(1);
28982 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
28983 Op = Op.getOperand(0);
28986 // The first operand of the shuffle should be the same as the other operand
28988 if (!Shuffle || (Shuffle->getOperand(0) != Op))
28991 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
28992 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
28993 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
29000 // Given a select, detect the following pattern:
29001 // 1: %2 = zext <N x i8> %0 to <N x i32>
29002 // 2: %3 = zext <N x i8> %1 to <N x i32>
29003 // 3: %4 = sub nsw <N x i32> %2, %3
29004 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
29005 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
29006 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
29007 // This is useful as it is the input into a SAD pattern.
29008 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
29010 // Check the condition of the select instruction is greater-than.
29011 SDValue SetCC = Select->getOperand(0);
29012 if (SetCC.getOpcode() != ISD::SETCC)
29014 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
29015 if (CC != ISD::SETGT && CC != ISD::SETLT)
29018 SDValue SelectOp1 = Select->getOperand(1);
29019 SDValue SelectOp2 = Select->getOperand(2);
29021 // The following instructions assume SelectOp1 is the subtraction operand
29022 // and SelectOp2 is the negation operand.
29023 // In the case of SETLT this is the other way around.
29024 if (CC == ISD::SETLT)
29025 std::swap(SelectOp1, SelectOp2);
29027 // The second operand of the select should be the negation of the first
29028 // operand, which is implemented as 0 - SelectOp1.
29029 if (!(SelectOp2.getOpcode() == ISD::SUB &&
29030 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
29031 SelectOp2.getOperand(1) == SelectOp1))
29034 // The first operand of SetCC is the first operand of the select, which is the
29035 // difference between the two input vectors.
29036 if (SetCC.getOperand(0) != SelectOp1)
29039 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
29041 if ((CC == ISD::SETLT) &&
29042 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
29044 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
29047 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
29048 if ((CC == ISD::SETGT) &&
29049 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
29050 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
29053 // The first operand of the select is the difference between the two input
29055 if (SelectOp1.getOpcode() != ISD::SUB)
29058 Op0 = SelectOp1.getOperand(0);
29059 Op1 = SelectOp1.getOperand(1);
29061 // Check if the operands of the sub are zero-extended from vectors of i8.
29062 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
29063 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
29064 Op1.getOpcode() != ISD::ZERO_EXTEND ||
29065 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
29071 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
29073 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
29074 const SDValue &Zext1, const SDLoc &DL) {
29076 // Find the appropriate width for the PSADBW.
29077 EVT InVT = Zext0.getOperand(0).getValueType();
29078 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
29080 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
29081 // fill in the missing vector elements with 0.
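// Illustrative example (hypothetical shapes): with <4 x i8> inputs and
// RegSize == 128, NumConcat == 4, so each input becomes a v16i8 whose upper
// 12 bytes are zero; PSADBW then produces a v2i64 where each 64-bit lane
// holds the sum of absolute differences of the corresponding 8 bytes.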
29082 unsigned NumConcat = RegSize / InVT.getSizeInBits();
29083 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
29084 Ops[0] = Zext0.getOperand(0);
29085 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
29086 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29087 Ops[0] = Zext1.getOperand(0);
29088 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29090 // Actually build the SAD
29091 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
29092 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
29095 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
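// A rough sketch of the intended transform (illustrative, names invented):
// given a v4i32 compare result %m (each lane 0 or -1) that is AND-reduced
// (all_of) down to a single element which is then extracted, we emit
//   %bits = MOVMSKPS %m                 ; four sign bits -> i32
//   %res  = (%bits == 0xF) ? -1 : 0     ; all_of: every mask bit set
// For an OR-reduction (any_of) the final compare is (%bits != 0) instead.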
29096 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
29098 const X86Subtarget &Subtarget) {
29099 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
29100 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
29103 EVT ExtractVT = Extract->getValueType(0);
29104 unsigned BitWidth = ExtractVT.getSizeInBits();
29105 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
29106 ExtractVT != MVT::i8)
29109 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
29110 for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
29111 SDValue Match = matchBinOpReduction(Extract, Op);
29115 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
29116 // which we can't support here for now.
29117 if (Match.getScalarValueSizeInBits() != BitWidth)
29120 // We require AVX2 for PMOVMSKB for v16i16/v32i8.
29121 unsigned MatchSizeInBits = Match.getValueSizeInBits();
29122 if (!(MatchSizeInBits == 128 ||
29123 (MatchSizeInBits == 256 &&
29124 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
29127 // Don't bother performing this for 2-element vectors.
29128 if (Match.getValueType().getVectorNumElements() <= 2)
29131 // Check that we are extracting a reduction of all sign bits.
29132 if (DAG.ComputeNumSignBits(Match) != BitWidth)
29135 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
29137 if (64 == BitWidth || 32 == BitWidth)
29138 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
29139 MatchSizeInBits / BitWidth);
29141 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
29144 ISD::CondCode CondCode;
29145 if (Op == ISD::OR) {
29146 // any_of -> MOVMSK != 0
29147 CompareBits = APInt::getNullValue(32);
29148 CondCode = ISD::CondCode::SETNE;
29150 // all_of -> MOVMSK == ((1 << NumElts) - 1)
29151 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
29152 CondCode = ISD::CondCode::SETEQ;
29155 // Perform the select as i32/i64 and then truncate to avoid partial register stalls.
29157 unsigned ResWidth = std::max(BitWidth, 32u);
29158 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
29160 SDValue Zero = DAG.getConstant(0, DL, ResVT);
29161 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
29162 SDValue Res = DAG.getBitcast(MaskVT, Match);
29163 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
29164 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
29165 Ones, Zero, CondCode);
29166 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
29172 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
29173 const X86Subtarget &Subtarget) {
29174 // PSADBW is only supported on SSE2 and up.
29175 if (!Subtarget.hasSSE2())
29178 // Verify that the type we're extracting from is an integer type wider than i16.
29179 EVT VT = Extract->getOperand(0).getValueType();
29180 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
29183 unsigned RegSize = 128;
29184 if (Subtarget.hasBWI())
29186 else if (Subtarget.hasAVX2())
29189 // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
29190 // TODO: We should be able to handle larger vectors by splitting them before
29191 // feeding them into several SADs, and then reducing over those.
29192 if (RegSize / VT.getVectorNumElements() < 8)
29195 // Match shuffle + add pyramid.
29196 SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
29198 // The operand is expected to be zero extended from i8
29199 // (verified in detectZextAbsDiff).
29200 // In order to convert to i64 and above, additional any/zero/sign
29201 // extend is expected.
29202 // The zero extend from 32 bits has no mathematical effect on the result.
29203 // The sign extend is likewise effectively a zero extend (it extends the
29204 // sign bit, which is zero).
29205 // So it is correct to skip the sign/zero extend instruction.
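// As a sketch of the kind of DAG expected at this point (illustrative only,
// names invented):
//   %d   : v16i32 = sub (zext v16i8 %a), (zext v16i8 %b)
//   %abs : v16i32 = vselect (setcc %d, 0, setgt), %d, (sub 0, %d)
//   ...    shuffle+add reduction pyramid over %abs ...
//   %r   : i32    = extract_vector_elt <reduced>, 0
// possibly with a sign/zero/any extend between the reduction and the
// extract, which is what the check below peels off.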
29206 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
29207 Root.getOpcode() == ISD::ZERO_EXTEND ||
29208 Root.getOpcode() == ISD::ANY_EXTEND))
29209 Root = Root.getOperand(0);
29211 // If there was a match, we want Root to be a select that is the root of an
29212 // abs-diff pattern.
29213 if (!Root || (Root.getOpcode() != ISD::VSELECT))
29216 // Check whether we have an abs-diff pattern feeding into the select.
29217 SDValue Zext0, Zext1;
29218 if (!detectZextAbsDiff(Root, Zext0, Zext1))
29221 // Create the SAD instruction.
29223 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
29225 // If the original vector was wider than 8 elements, sum over the results
29226 // in the SAD vector.
29227 unsigned Stages = Log2_32(VT.getVectorNumElements());
29228 MVT SadVT = SAD.getSimpleValueType();
29230 unsigned SadElems = SadVT.getVectorNumElements();
29232 for(unsigned i = Stages - 3; i > 0; --i) {
29233 SmallVector<int, 16> Mask(SadElems, -1);
29234 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
29235 Mask[j] = MaskEnd + j;
29238 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
29239 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
29243 MVT Type = Extract->getSimpleValueType(0);
29244 unsigned TypeSizeInBits = Type.getSizeInBits();
29245 // Return the lowest TypeSizeInBits bits.
29246 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
29247 SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
29248 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
29249 Extract->getOperand(1));
29252 // Attempt to peek through a target shuffle and extract the scalar from the source.
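// For example (a sketch, not from the original source): for
//   t0: v4i32 = pshufd %x, imm          ; shuffle mask <2,3,0,1>
//   t1: i32   = extract_vector_elt t0, 0
// the mask maps extraction index 0 to source element 2, so this can be
// rewritten as
//   t1: i32   = extract_vector_elt %x, 2
// subject to the SSE-level restrictions checked below.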
29254 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
29255 TargetLowering::DAGCombinerInfo &DCI,
29256 const X86Subtarget &Subtarget) {
29257 if (DCI.isBeforeLegalizeOps())
29260 SDValue Src = N->getOperand(0);
29261 SDValue Idx = N->getOperand(1);
29263 EVT VT = N->getValueType(0);
29264 EVT SrcVT = Src.getValueType();
29265 EVT SrcSVT = SrcVT.getVectorElementType();
29266 unsigned NumSrcElts = SrcVT.getVectorNumElements();
29268 // Don't attempt this for boolean mask vectors or unknown extraction indices.
29269 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
29272 // Resolve the target shuffle inputs and mask.
29273 SmallVector<int, 16> Mask;
29274 SmallVector<SDValue, 2> Ops;
29275 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask))
29278 // Attempt to narrow/widen the shuffle mask to the correct size.
29279 if (Mask.size() != NumSrcElts) {
29280 if ((NumSrcElts % Mask.size()) == 0) {
29281 SmallVector<int, 16> ScaledMask;
29282 int Scale = NumSrcElts / Mask.size();
29283 scaleShuffleMask(Scale, Mask, ScaledMask);
29284 Mask = std::move(ScaledMask);
29285 } else if ((Mask.size() % NumSrcElts) == 0) {
29286 SmallVector<int, 16> WidenedMask;
29287 while (Mask.size() > NumSrcElts &&
29288 canWidenShuffleElements(Mask, WidenedMask))
29289 Mask = std::move(WidenedMask);
29290 // TODO - investigate support for wider shuffle masks with known upper
29291 // undef/zero elements for implicit zero-extension.
29295 // Check if narrowing/widening failed.
29296 if (Mask.size() != NumSrcElts)
29299 int SrcIdx = Mask[N->getConstantOperandVal(1)];
29302 // If the shuffle source element is undef/zero then we can just accept it.
29303 if (SrcIdx == SM_SentinelUndef)
29304 return DAG.getUNDEF(VT);
29306 if (SrcIdx == SM_SentinelZero)
29307 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
29308 : DAG.getConstant(0, dl, VT);
29310 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
29311 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
29312 SrcIdx = SrcIdx % Mask.size();
29314 // We can only extract other elements from 128-bit vectors and in certain
29315 // circumstances, depending on SSE-level.
29316 // TODO: Investigate using extract_subvector for larger vectors.
29317 // TODO: Investigate float/double extraction if it will be just stored.
29318 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
29319 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
29320 assert(SrcSVT == VT && "Unexpected extraction type");
29321 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
29322 DAG.getIntPtrConstant(SrcIdx, dl));
29325 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
29326 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
29327 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
29328 "Unexpected extraction type");
29329 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
29330 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
29331 DAG.getIntPtrConstant(SrcIdx, dl));
29332 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
29333 DAG.getValueType(SrcSVT));
29334 return DAG.getZExtOrTrunc(Assert, dl, VT);
29340 /// Detect vector gather/scatter index generation and convert it from being a
29341 /// bunch of shuffles and extracts into a somewhat faster sequence.
29342 /// For i686, the best sequence is apparently storing the value and loading
29343 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
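/// For example (a sketch under the conditions checked below, i.e. all four
/// elements of a v4i32 %v are extracted and then sign/zero-extended): when
/// 64-bit shifts are legal this becomes roughly
///   %c  = bitcast %v to v2i64
///   %lo = extract_vector_elt %c, 0    ;  %hi = extract_vector_elt %c, 1
///   e0  = trunc %lo                   ;  e1  = trunc (sra %lo, 32)
///   e2  = trunc %hi                   ;  e3  = trunc (sra %hi, 32)
/// whereas without legal 64-bit shifts the vector is stored to a stack slot
/// and each element is reloaded as a scalar.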
29344 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
29345 TargetLowering::DAGCombinerInfo &DCI,
29346 const X86Subtarget &Subtarget) {
29347 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
29350 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
29353 SDValue InputVector = N->getOperand(0);
29354 SDValue EltIdx = N->getOperand(1);
29356 EVT SrcVT = InputVector.getValueType();
29357 EVT VT = N->getValueType(0);
29358 SDLoc dl(InputVector);
29360 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
29361 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29362 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
29363 SDValue MMXSrc = InputVector.getOperand(0);
29365 // The bitcast source is a direct mmx result.
29366 if (MMXSrc.getValueType() == MVT::x86mmx)
29367 return DAG.getBitcast(VT, InputVector);
29370 // Detect mmx to i32 conversion through a v2i32 elt extract.
29371 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29372 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
29373 SDValue MMXSrc = InputVector.getOperand(0);
29375 // The bitcast source is a direct mmx result.
29376 if (MMXSrc.getValueType() == MVT::x86mmx)
29377 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
29380 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
29381 isa<ConstantSDNode>(EltIdx) &&
29382 isa<ConstantSDNode>(InputVector.getOperand(0))) {
29383 uint64_t ExtractedElt = N->getConstantOperandVal(1);
29384 uint64_t InputValue = InputVector.getConstantOperandVal(0);
29385 uint64_t Res = (InputValue >> ExtractedElt) & 1;
29386 return DAG.getConstant(Res, dl, MVT::i1);
29389 // Check whether this extract is the root of a sum of absolute differences
29390 // pattern. This has to be done here because we really want it to happen
29391 // pre-legalization.
29392 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
29395 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
29396 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
29399 // Only operate on vectors of 4 elements, where the alternative shuffling
29400 // gets to be more expensive.
29401 if (SrcVT != MVT::v4i32)
29404 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
29405 // single use which is a sign-extend or zero-extend, and all elements are used.
29407 SmallVector<SDNode *, 4> Uses;
29408 unsigned ExtractedElements = 0;
29409 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
29410 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
29411 if (UI.getUse().getResNo() != InputVector.getResNo())
29414 SDNode *Extract = *UI;
29415 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
29418 if (Extract->getValueType(0) != MVT::i32)
29420 if (!Extract->hasOneUse())
29422 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
29423 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
29425 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
29428 // Record which element was extracted.
29429 ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
29430 Uses.push_back(Extract);
29433 // If not all the elements were used, this may not be worthwhile.
29434 if (ExtractedElements != 15)
29437 // Ok, we've now decided to do the transformation.
29438 // If 64-bit shifts are legal, use the extract-shift sequence,
29439 // otherwise bounce the vector off the cache.
29440 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29443 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
29444 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
29445 auto &DL = DAG.getDataLayout();
29446 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
29447 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29448 DAG.getConstant(0, dl, VecIdxTy));
29449 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29450 DAG.getConstant(1, dl, VecIdxTy));
29452 SDValue ShAmt = DAG.getConstant(
29453 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
29454 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
29455 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29456 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
29457 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
29458 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29459 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
29461 // Store the value to a temporary stack slot.
29462 SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
29463 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
29464 MachinePointerInfo());
29466 EVT ElementType = SrcVT.getVectorElementType();
29467 unsigned EltSize = ElementType.getSizeInBits() / 8;
29469 // Replace each use (extract) with a load of the appropriate element.
29470 for (unsigned i = 0; i < 4; ++i) {
29471 uint64_t Offset = EltSize * i;
29472 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
29473 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
29475 SDValue ScalarAddr =
29476 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
29478 // Load the scalar.
29480 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
29484 // Replace the extracts
29485 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
29486 UE = Uses.end(); UI != UE; ++UI) {
29487 SDNode *Extract = *UI;
29489 uint64_t IdxVal = Extract->getConstantOperandVal(1);
29490 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
29493 // The replacement was made in place; don't return anything.
29497 // TODO - merge with combineExtractVectorElt once it can handle the implicit
29498 // zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
29499 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
29500 // combineBasicSADPattern.
29501 static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
29502 TargetLowering::DAGCombinerInfo &DCI,
29503 const X86Subtarget &Subtarget) {
29504 return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
29507 /// If a vector select has an operand that is -1 or 0, try to simplify the
29508 /// select to a bitwise logic operation.
29510 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
29511 TargetLowering::DAGCombinerInfo &DCI,
29512 const X86Subtarget &Subtarget) {
29513 SDValue Cond = N->getOperand(0);
29514 SDValue LHS = N->getOperand(1);
29515 SDValue RHS = N->getOperand(2);
29516 EVT VT = LHS.getValueType();
29517 EVT CondVT = Cond.getValueType();
29519 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29521 if (N->getOpcode() != ISD::VSELECT)
29524 assert(CondVT.isVector() && "Vector select expects a vector selector!");
29526 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
29527 // Check if the first operand is all zeros and the Cond type is vXi1.
29528 // This situation only applies to AVX512.
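// A short illustrative sketch (not from the original source): with a vXi1
// mask %m on AVX-512,
//   vselect %m, zeroinitializer, %x
// is rewritten below as
//   vselect (xor %m, all-ones), %x, zeroinitializer
// i.e. the condition is inverted so the zero vector ends up in the false
// position, which maps more naturally onto zero-masked EVEX instructions.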
29529 if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
29530 CondVT.getVectorElementType() == MVT::i1) {
29531 // Invert the cond to not(cond) : xor(op,allones)=not(op)
29532 SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29533 DAG.getAllOnesConstant(DL, CondVT));
29534 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
29535 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
29538 // To use the condition operand as a bitwise mask, it must have elements that
29539 // are the same size as the select elements. I.e., the condition operand must
29540 // have already been promoted from the IR select condition type <N x i1>.
29541 // Don't check if the types themselves are equal because that excludes
29542 // vector floating-point selects.
29543 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
29546 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
29547 FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
29549 // Try to invert the condition if the true value is not all 1s and the false value is not all 0s.
29551 if (!TValIsAllOnes && !FValIsAllZeros &&
29552 // Check if the selector will be produced by CMPP*/PCMP*.
29553 Cond.getOpcode() == ISD::SETCC &&
29554 // Check if SETCC has already been promoted.
29555 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
29557 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
29558 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
29560 if (TValIsAllZeros || FValIsAllOnes) {
29561 SDValue CC = Cond.getOperand(2);
29562 ISD::CondCode NewCC =
29563 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
29564 Cond.getOperand(0).getValueType().isInteger());
29565 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
29567 std::swap(LHS, RHS);
29568 TValIsAllOnes = FValIsAllOnes;
29569 FValIsAllZeros = TValIsAllZeros;
29573 // vselect Cond, 111..., 000... -> Cond
29574 if (TValIsAllOnes && FValIsAllZeros)
29575 return DAG.getBitcast(VT, Cond);
29577 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
29580 // vselect Cond, 111..., X -> or Cond, X
29581 if (TValIsAllOnes) {
29582 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
29583 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
29584 return DAG.getBitcast(VT, Or);
29587 // vselect Cond, X, 000... -> and Cond, X
29588 if (FValIsAllZeros) {
29589 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
29590 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
29591 return DAG.getBitcast(VT, And);
29597 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
29598 SDValue Cond = N->getOperand(0);
29599 SDValue LHS = N->getOperand(1);
29600 SDValue RHS = N->getOperand(2);
29603 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
29604 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
29605 if (!TrueC || !FalseC)
29608 // Don't do this for crazy integer types.
29609 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
29612 // If this is efficiently invertible, canonicalize the TrueC/FalseC values
29613 // so that TrueC (the true value) is larger than FalseC.
29614 bool NeedsCondInvert = false;
29615 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
29616 // Efficiently invertible.
29617 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
29618 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
29619 isa<ConstantSDNode>(Cond.getOperand(1))))) {
29620 NeedsCondInvert = true;
29621 std::swap(TrueC, FalseC);
29624 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
29625 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
29626 if (NeedsCondInvert) // Invert the condition if needed.
29627 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29628 DAG.getConstant(1, DL, Cond.getValueType()));
29630 // Zero extend the condition if needed.
29631 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
29633 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
29634 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
29635 DAG.getConstant(ShAmt, DL, MVT::i8));
29638 // Optimize cases that will turn into an LEA instruction. This requires
29639 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
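// A worked example (illustrative): select C, 11, 2 has Diff = 9, a "fast"
// multiplier, so it can be emitted roughly as
//   %z = zext i1 C to i32
//   %m = mul %z, 9          ; becomes lea (%z,%z,8)
//   %r = add %m, 2          ; add the FalseC base
// yielding 11 when C is true and 2 when C is false, without a branch.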
29640 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
29641 uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
29642 if (N->getValueType(0) == MVT::i32)
29643 Diff = (unsigned)Diff;
29645 bool isFastMultiplier = false;
29647 switch ((unsigned char)Diff) {
29650 case 1: // result = add base, cond
29651 case 2: // result = lea base( , cond*2)
29652 case 3: // result = lea base(cond, cond*2)
29653 case 4: // result = lea base( , cond*4)
29654 case 5: // result = lea base(cond, cond*4)
29655 case 8: // result = lea base( , cond*8)
29656 case 9: // result = lea base(cond, cond*8)
29657 isFastMultiplier = true;
29662 if (isFastMultiplier) {
29663 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
29664 if (NeedsCondInvert) // Invert the condition if needed.
29665 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29666 DAG.getConstant(1, DL, Cond.getValueType()));
29668 // Zero extend the condition if needed.
29669 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
29670 // Scale the condition by the difference.
29672 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
29673 DAG.getConstant(Diff, DL, Cond.getValueType()));
29675 // Add the base if non-zero.
29676 if (FalseC->getAPIntValue() != 0)
29677 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29678 SDValue(FalseC, 0));
29686 // If this is a bitcasted op that can be represented as another type, push
29687 // the bitcast to the inputs. This allows more opportunities for pattern
29688 // matching masked instructions. This is called when we know that the operation
29689 // is used as one of the inputs of a vselect.
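// For instance (a sketch, not from the original source): if a masked select
// uses
//   bitcast v8i32 (X86ISD::VALIGN v4i64 %a, %b, imm=1)
// then the one-element v4i64 rotation equals a two-element v8i32 rotation,
// so the bitcasts can be pushed onto %a/%b and the node rebuilt as
//   X86ISD::VALIGN v8i32 %a', %b', imm=2
// letting the vselect mask apply directly to the i32-element operation.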
29690 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
29691 TargetLowering::DAGCombinerInfo &DCI) {
29692 // Make sure we have a bitcast.
29693 if (OrigOp.getOpcode() != ISD::BITCAST)
29696 SDValue Op = OrigOp.getOperand(0);
29698 // If the operation is used by anything other than the bitcast, we shouldn't
29699 // do this combine as that would replicate the operation.
29700 if (!Op.hasOneUse())
29703 MVT VT = OrigOp.getSimpleValueType();
29704 MVT EltVT = VT.getVectorElementType();
29705 SDLoc DL(Op.getNode());
29707 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
29709 Op0 = DAG.getBitcast(VT, Op0);
29710 DCI.AddToWorklist(Op0.getNode());
29711 Op1 = DAG.getBitcast(VT, Op1);
29712 DCI.AddToWorklist(Op1.getNode());
29713 DCI.CombineTo(OrigOp.getNode(),
29714 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
29718 unsigned Opcode = Op.getOpcode();
29720 case X86ISD::PALIGNR:
29721 // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
29722 if (!VT.is128BitVector())
29724 Opcode = X86ISD::VALIGN;
29726 case X86ISD::VALIGN: {
29727 if (EltVT != MVT::i32 && EltVT != MVT::i64)
29729 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
29730 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29731 unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
29732 unsigned EltSize = EltVT.getSizeInBits();
29733 // Make sure we can represent the same shift with the new VT.
29734 if ((ShiftAmt % EltSize) != 0)
29736 Imm = ShiftAmt / EltSize;
29737 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
29738 DAG.getConstant(Imm, DL, MVT::i8));
29740 case X86ISD::SHUF128: {
29741 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
29743 // Only change element size, not type.
29744 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
29746 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
29749 case ISD::INSERT_SUBVECTOR: {
29750 unsigned EltSize = EltVT.getSizeInBits();
29751 if (EltSize != 32 && EltSize != 64)
29753 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29754 // Only change element size, not type.
29755 if (EltVT.isInteger() != OpEltVT.isInteger())
29757 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
29758 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
29759 SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
29760 DCI.AddToWorklist(Op0.getNode());
29761 // Op1 needs to be bitcasted to a smaller vector with the same element type.
29762 SDValue Op1 = Op.getOperand(1);
29763 MVT Op1VT = MVT::getVectorVT(EltVT,
29764 Op1.getSimpleValueType().getSizeInBits() / EltSize);
29765 Op1 = DAG.getBitcast(Op1VT, Op1);
29766 DCI.AddToWorklist(Op1.getNode());
29767 DCI.CombineTo(OrigOp.getNode(),
29768 DAG.getNode(Opcode, DL, VT, Op0, Op1,
29769 DAG.getIntPtrConstant(Imm, DL)));
29772 case ISD::EXTRACT_SUBVECTOR: {
29773 unsigned EltSize = EltVT.getSizeInBits();
29774 if (EltSize != 32 && EltSize != 64)
29776 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29777 // Only change element size, not type.
29778 if (EltVT.isInteger() != OpEltVT.isInteger())
29780 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
29781 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
29782 // Op0 needs to be bitcasted to a larger vector with the same element type.
29783 SDValue Op0 = Op.getOperand(0);
29784 MVT Op0VT = MVT::getVectorVT(EltVT,
29785 Op0.getSimpleValueType().getSizeInBits() / EltSize);
29786 Op0 = DAG.getBitcast(Op0VT, Op0);
29787 DCI.AddToWorklist(Op0.getNode());
29788 DCI.CombineTo(OrigOp.getNode(),
29789 DAG.getNode(Opcode, DL, VT, Op0,
29790 DAG.getIntPtrConstant(Imm, DL)));
29793 case X86ISD::SUBV_BROADCAST: {
29794 unsigned EltSize = EltVT.getSizeInBits();
29795 if (EltSize != 32 && EltSize != 64)
29797 // Only change element size, not type.
29798 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
29800 SDValue Op0 = Op.getOperand(0);
29801 MVT Op0VT = MVT::getVectorVT(EltVT,
29802 Op0.getSimpleValueType().getSizeInBits() / EltSize);
29803 Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
29804 DCI.AddToWorklist(Op0.getNode());
29805 DCI.CombineTo(OrigOp.getNode(),
29806 DAG.getNode(Opcode, DL, VT, Op0));
29814 /// Do target-specific dag combines on SELECT and VSELECT nodes.
29815 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
29816 TargetLowering::DAGCombinerInfo &DCI,
29817 const X86Subtarget &Subtarget) {
29819 SDValue Cond = N->getOperand(0);
29820 // Get the LHS/RHS of the select.
29821 SDValue LHS = N->getOperand(1);
29822 SDValue RHS = N->getOperand(2);
29823 EVT VT = LHS.getValueType();
29824 EVT CondVT = Cond.getValueType();
29825 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29827 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
29828 // instructions match the semantics of the common C idiom x<y?x:y but not
29829 // x<=y?x:y, because of how they handle negative zero (which can be
29830 // ignored in unsafe-math mode).
29831 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
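// A small illustrative sketch: for
//   %c = setcc %x, %y, setlt          ; x < y
//   %r = select %c, %x, %y            ; i.e. x < y ? x : y
// the natural lowering is X86ISD::FMIN %x, %y; the case analysis below
// decides, per condition code, whether NaNs or signed zeros could change
// the result and therefore whether the operands must be swapped or the
// fold skipped.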
29832 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
29833 VT != MVT::f80 && VT != MVT::f128 &&
29834 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
29835 (Subtarget.hasSSE2() ||
29836 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
29837 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
29839 unsigned Opcode = 0;
29840 // Check for x CC y ? x : y.
29841 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
29842 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
29846 // Converting this to a min would handle NaNs incorrectly, and swapping
29847 // the operands would cause it to handle comparisons between positive
29848 // and negative zero incorrectly.
29849 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
29850 if (!DAG.getTarget().Options.UnsafeFPMath &&
29851 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
29853 std::swap(LHS, RHS);
29855 Opcode = X86ISD::FMIN;
29858 // Converting this to a min would handle comparisons between positive
29859 // and negative zero incorrectly.
29860 if (!DAG.getTarget().Options.UnsafeFPMath &&
29861 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
29863 Opcode = X86ISD::FMIN;
29866 // Converting this to a min would handle both negative zeros and NaNs
29867 // incorrectly, but we can swap the operands to fix both.
29868 std::swap(LHS, RHS);
29872 Opcode = X86ISD::FMIN;
29876 // Converting this to a max would handle comparisons between positive
29877 // and negative zero incorrectly.
29878 if (!DAG.getTarget().Options.UnsafeFPMath &&
29879 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
29881 Opcode = X86ISD::FMAX;
29884 // Converting this to a max would handle NaNs incorrectly, and swapping
29885 // the operands would cause it to handle comparisons between positive
29886 // and negative zero incorrectly.
29887 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
29888 if (!DAG.getTarget().Options.UnsafeFPMath &&
29889 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
29891 std::swap(LHS, RHS);
29893 Opcode = X86ISD::FMAX;
29896 // Converting this to a max would handle both negative zeros and NaNs
29897 // incorrectly, but we can swap the operands to fix both.
29898 std::swap(LHS, RHS);
29902 Opcode = X86ISD::FMAX;
29905 // Check for x CC y ? y : x -- a min/max with reversed arms.
29906 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
29907 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
29911 // Converting this to a min would handle comparisons between positive
29912 // and negative zero incorrectly, and swapping the operands would
29913 // cause it to handle NaNs incorrectly.
29914 if (!DAG.getTarget().Options.UnsafeFPMath &&
29915 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
29916 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
29918 std::swap(LHS, RHS);
29920 Opcode = X86ISD::FMIN;
29923 // Converting this to a min would handle NaNs incorrectly.
29924 if (!DAG.getTarget().Options.UnsafeFPMath &&
29925 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
29927 Opcode = X86ISD::FMIN;
29930 // Converting this to a min would handle both negative zeros and NaNs
29931 // incorrectly, but we can swap the operands to fix both.
29932 std::swap(LHS, RHS);
29936 Opcode = X86ISD::FMIN;
29940 // Converting this to a max would handle NaNs incorrectly.
29941 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
29943 Opcode = X86ISD::FMAX;
29946 // Converting this to a max would handle comparisons between positive
29947 // and negative zero incorrectly, and swapping the operands would
29948 // cause it to handle NaNs incorrectly.
29949 if (!DAG.getTarget().Options.UnsafeFPMath &&
29950 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
29951 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
29953 std::swap(LHS, RHS);
29955 Opcode = X86ISD::FMAX;
29958 // Converting this to a max would handle both negative zeros and NaNs
29959 // incorrectly, but we can swap the operands to fix both.
29960 std::swap(LHS, RHS);
29964 Opcode = X86ISD::FMAX;
29970 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
29973 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
29974 // lowering on KNL. In this case we convert it to
29975 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
29976 // The same applies to all 128- and 256-bit vectors of i8 and i16.
29977 // Since SKX, these selects have a proper lowering.
29978 if (Subtarget.hasAVX512() && CondVT.isVector() &&
29979 CondVT.getVectorElementType() == MVT::i1 &&
29980 (VT.is128BitVector() || VT.is256BitVector()) &&
29981 (VT.getVectorElementType() == MVT::i8 ||
29982 VT.getVectorElementType() == MVT::i16) &&
29983 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
29984 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
29985 DCI.AddToWorklist(Cond.getNode());
29986 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
29989 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
29992 // Canonicalize max and min:
29993 // (x > y) ? x : y -> (x >= y) ? x : y
29994 // (x < y) ? x : y -> (x <= y) ? x : y
29995 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
29996 // the need for an extra compare
29997 // against zero. e.g.
29998 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
30000 // testl %edi, %edi
30002 // cmovgl %edi, %eax
30006 // cmovsl %eax, %edi
30007 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
30008 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30009 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30010 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30015 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
30016 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
30017 Cond.getOperand(0), Cond.getOperand(1), NewCC);
30018 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
30023 // Early exit check
30024 if (!TLI.isTypeLegal(VT))
30027 // Match VSELECTs into subs with unsigned saturation.
30028 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
30029 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
30030 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
30031 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
30032 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30034 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
30035 // left side invert the predicate to simplify logic below.
30037 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
30039 CC = ISD::getSetCCInverse(CC, true);
30040 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
30044 if (Other.getNode() && Other->getNumOperands() == 2 &&
30045 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
30046 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
30047 SDValue CondRHS = Cond->getOperand(1);
30049 // Look for a general sub with unsigned saturation first.
30050 // x >= y ? x-y : 0 --> subus x, y
30051 // x > y ? x-y : 0 --> subus x, y
30052 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
30053 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
30054 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
30056 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
30057 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
30058 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
30059 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
30060 // If the RHS is a constant we have to reverse the const
30061 // canonicalization.
30062 // x > C-1 ? x+(-C) : 0 --> subus x, C
30063 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
30064 CondRHSConst->getAPIntValue() ==
30065 (-OpRHSConst->getAPIntValue() - 1))
30066 return DAG.getNode(
30067 X86ISD::SUBUS, DL, VT, OpLHS,
30068 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
30070 // Another special case: If C was a sign bit, the sub has been
30071 // canonicalized into a xor.
30072 // FIXME: Would it be better to use computeKnownBits to determine
30073 // whether it's safe to decanonicalize the xor?
30074 // x s< 0 ? x^C : 0 --> subus x, C
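// A worked illustration (v8i16 elements, C = 0x8000, the sign mask):
// "x s< 0" holds exactly when x, viewed as unsigned, is >= 0x8000; in that
// case x ^ 0x8000 clears the top bit, i.e. equals x - 0x8000, and otherwise
// subus x, 0x8000 saturates to 0. So (x s< 0) ? (x ^ 0x8000) : 0 is the
// same value as subus x, 0x8000.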
30075 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
30076 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
30077 OpRHSConst->getAPIntValue().isSignMask())
30078 // Note that we have to rebuild the RHS constant here to ensure we
30079 // don't rely on particular values of undef lanes.
30080 return DAG.getNode(
30081 X86ISD::SUBUS, DL, VT, OpLHS,
30082 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
30087 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
30090 // If this is a *dynamic* select (non-constant condition) and we can match
30091 // this node with one of the variable blend instructions, restructure the
30092 // condition so that blends can use the high (sign) bit of each element and
30093 // use SimplifyDemandedBits to simplify the condition operand.
30094 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
30095 !DCI.isBeforeLegalize() &&
30096 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
30097 unsigned BitWidth = Cond.getScalarValueSizeInBits();
30099 // Don't optimize vector selects that map to mask-registers.
30103 // We can only handle the cases where VSELECT is directly legal on the
30104 // subtarget. We custom lower VSELECT nodes with constant conditions and
30105 // this makes it hard to see whether a dynamic VSELECT will correctly
30106 // lower, so we both check the operation's status and explicitly handle the
30107 // cases where a *dynamic* blend will fail even though a constant-condition
30108 // blend could be custom lowered.
30109 // FIXME: We should find a better way to handle this class of problems.
30110 // Potentially, we should combine constant-condition vselect nodes
30111 // pre-legalization into shuffles and not mark as many types as custom
30113 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
30115 // FIXME: We don't support i16-element blends currently. We could and
30116 // should support them by making *all* the bits in the condition be set
30117 // rather than just the high bit and using an i8-element blend.
30118 if (VT.getVectorElementType() == MVT::i16)
30120 // Dynamic blending was only available from SSE4.1 onward.
30121 if (VT.is128BitVector() && !Subtarget.hasSSE41())
30123 // Byte blends are only available in AVX2
30124 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
30127 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
30128 APInt DemandedMask(APInt::getSignMask(BitWidth));
30130 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
30131 DCI.isBeforeLegalizeOps());
30132 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
30133 TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
30134 // If we changed the computation somewhere in the DAG, this change will
30135 // affect all users of Cond. Make sure it is fine and update all the nodes
30136 // so that we do not use the generic VSELECT anymore. Otherwise, we may
30137 // perform wrong optimizations as we messed with the actual expectation
30138 // for the vector boolean values.
30139 if (Cond != TLO.Old) {
30140 // Check all uses of the condition operand to check whether it will be
30141 // consumed by non-BLEND instructions. Those may require that all bits
30142 // are set properly.
30143 for (SDNode *U : Cond->uses()) {
30144 // TODO: Add other opcodes eventually lowered into BLEND.
30145 if (U->getOpcode() != ISD::VSELECT)
30149 // Update all users of the condition before committing the change, so
30150 // that the VSELECT optimizations that expect the correct vector boolean
30151 // value will not be triggered.
30152 for (SDNode *U : Cond->uses()) {
30153 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
30154 U->getValueType(0), Cond, U->getOperand(1),
30156 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
30158 DCI.CommitTargetLoweringOpt(TLO);
30161 // Only Cond (rather than other nodes in the computation chain) was
30162 // changed. Change the condition just for N to keep the opportunity to
30163 // optimize all other users their own way.
30164 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
30165 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
30170 // Look for vselects with LHS/RHS being bitcasted from an operation that
30171 // can be executed on another type. Push the bitcast to the inputs of
30172 // the operation. This exposes opportunities for using masking instructions.
30173 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
30174 CondVT.getVectorElementType() == MVT::i1) {
30175 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
30176 return SDValue(N, 0);
30177 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
30178 return SDValue(N, 0);
30185 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
30187 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
30188 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
30189 /// Note that this is only legal for some op/cc combinations.
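/// Roughly (an illustrative sketch, not from a specific test): for
///   %old = atomicrmw add i32* %p, i32 1
///   %s   = icmp slt i32 %old, 0            ; tested as COND_S on a CMP
/// the EFLAGS of a plain "lock add $1, (%p)" can be reused by testing the
/// incremented value with COND_LE instead: "old < 0" is equivalent to
/// "old + 1 <= 0" once overflow is accounted for, which COND_LE
/// (ZF or SF != OF) does.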
30190 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
30191 SelectionDAG &DAG) {
30192 // This combine only operates on CMP-like nodes.
30193 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30194 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30197 // Can't replace the cmp if it has more uses than the one we're looking at.
30198 // FIXME: We would like to be able to handle this, but would need to make sure
30199 // all uses were updated.
30200 if (!Cmp.hasOneUse())
30203 // This only applies to variations of the common case:
30204 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
30205 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
30206 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
30207 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
30208 // Using the proper condcodes (see below), overflow is checked for.
30210 // FIXME: We can generalize both constraints:
30211 // - XOR/OR/AND (if they were made to survive AtomicExpand)
30213 // if the result is compared.
30215 SDValue CmpLHS = Cmp.getOperand(0);
30216 SDValue CmpRHS = Cmp.getOperand(1);
30218 if (!CmpLHS.hasOneUse())
30221 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
30222 if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
30225 const unsigned Opc = CmpLHS.getOpcode();
30227 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
30230 SDValue OpRHS = CmpLHS.getOperand(2);
30231 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
30235 APInt Addend = OpRHSC->getAPIntValue();
30236 if (Opc == ISD::ATOMIC_LOAD_SUB)
30239 if (CC == X86::COND_S && Addend == 1)
30241 else if (CC == X86::COND_NS && Addend == 1)
30243 else if (CC == X86::COND_G && Addend == -1)
30245 else if (CC == X86::COND_LE && Addend == -1)
30250 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
30251 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
30252 DAG.getUNDEF(CmpLHS.getValueType()));
30253 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
30257 // Check whether a boolean test is testing a boolean value generated by
30258 // X86ISD::SETCC. If so, return the operand of that SETCC and the proper condition flag.
30261 // Simplify the following patterns:
30262 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
30263 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
30264 // to (Op EFLAGS Cond)
30266 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
30267 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
30268 // to (Op EFLAGS !Cond)
30270 // where Op could be BRCOND or CMOV.
30272 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
30273 // This combine only operates on CMP-like nodes.
30274 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30275 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30278 // Quit if not used as a boolean value.
30279 if (CC != X86::COND_E && CC != X86::COND_NE)
30282 // Check CMP operands. One of them should be 0 or 1 and the other should be
30283 // a SETCC or extended from it.
30284 SDValue Op1 = Cmp.getOperand(0);
30285 SDValue Op2 = Cmp.getOperand(1);
30288 const ConstantSDNode* C = nullptr;
30289 bool needOppositeCond = (CC == X86::COND_E);
30290 bool checkAgainstTrue = false; // Is it a comparison against 1?
30292 if ((C = dyn_cast<ConstantSDNode>(Op1)))
30294 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
30296 else // Quit if neither operand is a constant.
30299 if (C->getZExtValue() == 1) {
30300 needOppositeCond = !needOppositeCond;
30301 checkAgainstTrue = true;
30302 } else if (C->getZExtValue() != 0)
30303 // Quit if the constant is neither 0 nor 1.
30306 bool truncatedToBoolWithAnd = false;
30307 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
30308 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
30309 SetCC.getOpcode() == ISD::TRUNCATE ||
30310 SetCC.getOpcode() == ISD::AND) {
30311 if (SetCC.getOpcode() == ISD::AND) {
30313 if (isOneConstant(SetCC.getOperand(0)))
30315 if (isOneConstant(SetCC.getOperand(1)))
30319 SetCC = SetCC.getOperand(OpIdx);
30320 truncatedToBoolWithAnd = true;
30322 SetCC = SetCC.getOperand(0);
30325 switch (SetCC.getOpcode()) {
30326 case X86ISD::SETCC_CARRY:
30327 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
30328 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
30329 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
30330 // truncated to i1 using 'and'.
30331 if (checkAgainstTrue && !truncatedToBoolWithAnd)
30333 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
30334 "Invalid use of SETCC_CARRY!");
30336 case X86ISD::SETCC:
30337 // Set the condition code or opposite one if necessary.
30338 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
30339 if (needOppositeCond)
30340 CC = X86::GetOppositeBranchCondition(CC);
30341 return SetCC.getOperand(1);
30342 case X86ISD::CMOV: {
30343 // Check whether the false/true values are canonical, i.e. 0 or 1.
30344 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
30345 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
30346 // Quit if true value is not a constant.
30349 // Quit if false value is not a constant.
30351 SDValue Op = SetCC.getOperand(0);
30352 // Skip 'zext' or 'trunc' node.
30353 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
30354 Op.getOpcode() == ISD::TRUNCATE)
30355 Op = Op.getOperand(0);
30356 // A special case for rdrand/rdseed, where 0 is set if false cond is found.
30358 if ((Op.getOpcode() != X86ISD::RDRAND &&
30359 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
30362 // Quit if false value is not the constant 0 or 1.
30363 bool FValIsFalse = true;
30364 if (FVal && FVal->getZExtValue() != 0) {
30365 if (FVal->getZExtValue() != 1)
30367 // If FVal is 1, opposite cond is needed.
30368 needOppositeCond = !needOppositeCond;
30369 FValIsFalse = false;
30371 // Quit if TVal is not the constant opposite of FVal.
30372 if (FValIsFalse && TVal->getZExtValue() != 1)
30374 if (!FValIsFalse && TVal->getZExtValue() != 0)
30376 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
30377 if (needOppositeCond)
30378 CC = X86::GetOppositeBranchCondition(CC);
30379 return SetCC.getOperand(3);
30386 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
30388 /// (X86or (X86setcc) (X86setcc))
30389 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
30390 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
30391 X86::CondCode &CC1, SDValue &Flags,
30393 if (Cond->getOpcode() == X86ISD::CMP) {
30394 if (!isNullConstant(Cond->getOperand(1)))
30397 Cond = Cond->getOperand(0);
30402 SDValue SetCC0, SetCC1;
30403 switch (Cond->getOpcode()) {
30404 default: return false;
30411 SetCC0 = Cond->getOperand(0);
30412 SetCC1 = Cond->getOperand(1);
30416 // Make sure we have SETCC nodes, using the same flags value.
30417 if (SetCC0.getOpcode() != X86ISD::SETCC ||
30418 SetCC1.getOpcode() != X86ISD::SETCC ||
30419 SetCC0->getOperand(1) != SetCC1->getOperand(1))
30422 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
30423 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
30424 Flags = SetCC0->getOperand(1);
30428 /// Optimize an EFLAGS definition used according to the condition code \p CC
30429 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
30430 /// uses of chain values.
30431 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
30432 SelectionDAG &DAG) {
30433 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
30435 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
30438 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
30439 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
30440 TargetLowering::DAGCombinerInfo &DCI,
30441 const X86Subtarget &Subtarget) {
30444 // If the flag operand isn't dead, don't touch this CMOV.
30445 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
30448 SDValue FalseOp = N->getOperand(0);
30449 SDValue TrueOp = N->getOperand(1);
30450 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
30451 SDValue Cond = N->getOperand(3);
30453 if (CC == X86::COND_E || CC == X86::COND_NE) {
30454 switch (Cond.getOpcode()) {
30458 // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
30459 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
30460 return (CC == X86::COND_E) ? FalseOp : TrueOp;
30464 // Try to simplify the EFLAGS and condition code operands.
30465 // We can't always do this as FCMOV only supports a subset of X86 cond.
30466 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
30467 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
30468 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
30470 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30474 // If this is a select between two integer constants, try to do some
30475 // optimizations. Note that the operands are ordered the opposite of SELECT operands.
30477 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
30478 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
30479 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
30480 // larger than FalseC (the false value).
30481 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
30482 CC = X86::GetOppositeBranchCondition(CC);
30483 std::swap(TrueC, FalseC);
30484 std::swap(TrueOp, FalseOp);
30487 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
30488 // This is efficient for any integer data type (including i8/i16) and shift amount.
30490 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
30491 Cond = getSETCC(CC, Cond, DL, DAG);
30493 // Zero extend the condition if needed.
30494 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
30496 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
30497 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
30498 DAG.getConstant(ShAmt, DL, MVT::i8));
30499 if (N->getNumValues() == 2) // Dead flag value?
30500 return DCI.CombineTo(N, Cond, SDValue());
30504 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
30505 // for any integer data type, including i8/i16.
30506 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
30507 Cond = getSETCC(CC, Cond, DL, DAG);
30509 // Zero extend the condition if needed.
30510 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
30511 FalseC->getValueType(0), Cond);
30512 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30513 SDValue(FalseC, 0));
30515 if (N->getNumValues() == 2) // Dead flag value?
30516 return DCI.CombineTo(N, Cond, SDValue());
30520 // Optimize cases that will turn into an LEA instruction. This requires
30521 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
30522 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
30523 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
30524 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
30526 bool isFastMultiplier = false;
30528 switch ((unsigned char)Diff) {
30530 case 1: // result = add base, cond
30531 case 2: // result = lea base( , cond*2)
30532 case 3: // result = lea base(cond, cond*2)
30533 case 4: // result = lea base( , cond*4)
30534 case 5: // result = lea base(cond, cond*4)
30535 case 8: // result = lea base( , cond*8)
30536 case 9: // result = lea base(cond, cond*8)
30537 isFastMultiplier = true;
30542 if (isFastMultiplier) {
30543 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
30544 Cond = getSETCC(CC, Cond, DL ,DAG);
30545 // Zero extend the condition if needed.
30546 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
30548 // Scale the condition by the difference.
30550 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
30551 DAG.getConstant(Diff, DL, Cond.getValueType()));
30553 // Add the base if non-zero.
30554 if (FalseC->getAPIntValue() != 0)
30555 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30556 SDValue(FalseC, 0));
30557 if (N->getNumValues() == 2) // Dead flag value?
30558 return DCI.CombineTo(N, Cond, SDValue());
30565 // Handle these cases:
30566 // (select (x != c), e, c) -> (select (x != c), e, x),
30567 // (select (x == c), c, e) -> (select (x == c), x, e)
30568 // where the c is an integer constant, and the "select" is the combination
30569 // of CMOV and CMP.
30571 // The rationale for this change is that the conditional-move from a constant
30572 // needs two instructions, whereas conditional-move from a register needs
30573 // only one instruction.
30575 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
30576 // some instruction-combining opportunities. This opt needs to be
30577 // postponed as late as possible.
30579 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
30580 // the DCI.xxxx conditions are provided to postpone the optimization as
30581 // late as possible.
30583 ConstantSDNode *CmpAgainst = nullptr;
30584 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
30585 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
30586 !isa<ConstantSDNode>(Cond.getOperand(0))) {
30588 if (CC == X86::COND_NE &&
30589 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
30590 CC = X86::GetOppositeBranchCondition(CC);
30591 std::swap(TrueOp, FalseOp);
30594 if (CC == X86::COND_E &&
30595 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
30596 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
30597 DAG.getConstant(CC, DL, MVT::i8), Cond };
30598 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
30603 // Fold and/or of setcc's to double CMOV:
30604 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
30605 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
30607 // This combine lets us generate:
30608 // cmovcc1 (jcc1 if we don't have CMOV)
30614 // cmovne (jne if we don't have CMOV)
30615 // When we can't use the CMOV instruction, it might increase branch mispredicts.
30617 // When we can use CMOV, or when there is no mispredict, this improves
30618 // throughput and reduces register pressure.
30620 if (CC == X86::COND_NE) {
30622 X86::CondCode CC0, CC1;
30624 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
30626 std::swap(FalseOp, TrueOp);
30627 CC0 = X86::GetOppositeBranchCondition(CC0);
30628 CC1 = X86::GetOppositeBranchCondition(CC1);
30631 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
30633 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
30634 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
30635 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30636 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
30644 /// Different mul shrinking modes.
30645 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
30647 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
30648 EVT VT = N->getOperand(0).getValueType();
30649 if (VT.getScalarSizeInBits() != 32)
30652 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
30653 unsigned SignBits[2] = {1, 1};
30654 bool IsPositive[2] = {false, false};
30655 for (unsigned i = 0; i < 2; i++) {
30656 SDValue Opd = N->getOperand(i);
30658 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
30659 // compute the sign bits for it separately.
30660 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
30661 // For anyextend, it is safe to assume an appropriate number of leading sign/zero bits.
30663 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
30665 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
30670 IsPositive[i] = true;
30671 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
30672 // All the operands of BUILD_VECTOR need to be integer constants.
30673 // Find the smallest value range which all the operands belong to.
30675 IsPositive[i] = true;
30676 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
30677 if (SubOp.isUndef())
30679 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
30682 APInt IntVal = CN->getAPIntValue();
30683 if (IntVal.isNegative())
30684 IsPositive[i] = false;
30685 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
30688 SignBits[i] = DAG.ComputeNumSignBits(Opd);
30689 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
30690 IsPositive[i] = true;
30694 bool AllPositive = IsPositive[0] && IsPositive[1];
30695 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
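// As a sanity check on the thresholds below (illustrative reasoning): for an
// i32 element, a value in [-128, 127] has at least 32 - 8 + 1 = 25 sign bits,
// a value in [0, 255] has at least 24, [-32768, 32767] at least 17, and
// [0, 65535] at least 16.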
30696 // When ranges are from -128 ~ 127, use MULS8 mode.
30697 if (MinSignBits >= 25)
30699 // When ranges are from 0 ~ 255, use MULU8 mode.
30700 else if (AllPositive && MinSignBits >= 24)
30702 // When ranges are from -32768 ~ 32767, use MULS16 mode.
30703 else if (MinSignBits >= 17)
30705 // When ranges are from 0 ~ 65535, use MULU16 mode.
30706 else if (AllPositive && MinSignBits >= 16)
30713 /// When the operands of vector mul are extended from smaller size values,
30714 /// like i8 and i16, the type of mul may be shrunk to generate more
30715 /// efficient code. Two typical patterns are handled:
30717 /// %2 = sext/zext <N x i8> %1 to <N x i32>
30718 /// %4 = sext/zext <N x i8> %3 to <N x i32>
30719 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30720 /// %5 = mul <N x i32> %2, %4
30723 /// %2 = zext/sext <N x i16> %1 to <N x i32>
30724 /// %4 = zext/sext <N x i16> %3 to <N x i32>
30725 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30726 /// %5 = mul <N x i32> %2, %4
30728 /// There are four mul shrinking modes:
30729 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
30730 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
30731 /// generate pmullw+sext32 for it (MULS8 mode).
30732 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
30733 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
30734 /// generate pmullw+zext32 for it (MULU8 mode).
30735 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
30736 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
30737 /// generate pmullw+pmulhw for it (MULS16 mode).
30738 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
30739 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
30740 /// generate pmullw+pmulhuw for it (MULU16 mode).
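/// For example (illustrative case): if both <8 x i32> operands are
/// zero-extended from <8 x i8>, MULU8 applies, so the operands are truncated
/// to <8 x i16>, multiplied with a single pmullw, and the product is
/// zero-extended back to <8 x i32>.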
30741 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
30742 const X86Subtarget &Subtarget) {
30743 // Check for legality
30744 // pmullw/pmulhw require SSE2 (they are not available with plain SSE1).
30745 if (!Subtarget.hasSSE2())
30748 // Check for profitability
30749 // pmulld is supported since SSE41. It is better to use pmulld
30750 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
30751 // pmullw+pmulhw.
30752 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
30753 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
30757 if (!canReduceVMulWidth(N, DAG, Mode))
30761 SDValue N0 = N->getOperand(0);
30762 SDValue N1 = N->getOperand(1);
30763 EVT VT = N->getOperand(0).getValueType();
30764 unsigned RegSize = 128;
30765 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
30767 EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
30768 // Shrink the operands of mul.
30769 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
30770 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
30772 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
30773 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
30774 // lower part is needed.
30775 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
30776 if (Mode == MULU8 || Mode == MULS8) {
30777 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
30780 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30781 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
30782 // the higher part is also needed.
30783 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30784 ReducedVT, NewN0, NewN1);
30786 // Repack the lower part and higher part result of mul into a wider
30787 // result.
30788 // Generate shuffle functioning as punpcklwd.
30789 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
30790 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30791 ShuffleMask[2 * i] = i;
30792 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
30795 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30796 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
30797 // Generate shuffle functioning as punpckhwd.
30798 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30799 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
30800 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
30803 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30804 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
30805 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
30808 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
30809 // to legalize the mul explicitly because implicit legalization for type
30810 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
30811 // instructions which will not exist when we explicitly legalize it by
30812 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
30813 // <4 x i16> undef).
30815 // Legalize the operands of mul.
30816 // FIXME: We may be able to handle non-concatenated vectors by insertion.
30817 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
30818 if ((RegSize % ReducedSizeInBits) != 0)
30821 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
30822 DAG.getUNDEF(ReducedVT));
30824 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
30826 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
30828 if (Mode == MULU8 || Mode == MULS8) {
30829 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
30830 // part is needed.
30831 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
30833 // convert the type of mul result to VT.
30834 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30835 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
30836 : ISD::SIGN_EXTEND_VECTOR_INREG,
30838 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
30839 DAG.getIntPtrConstant(0, DL));
30841 // Generate the lower and higher parts of mul: pmullw and pmulhw/pmulhuw. For
30842 // MULU16/MULS16, both parts are needed.
30843 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
30844 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30845 OpsVT, NewN0, NewN1);
30847 // Repack the lower part and higher part result of mul into a wider
30848 // result. Make sure the type of mul result is VT.
30849 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30850 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
30851 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
30852 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
30853 DAG.getIntPtrConstant(0, DL));
30858 /// Optimize a single multiply with constant into two operations in order to
30859 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
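/// For example (illustrative constants): mul x, 45 can be rewritten as
/// (x * 9) * 5, i.e. two LEAs, and mul x, 40 as (x * 5) << 3, i.e. LEA + SHL.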
30860 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
30861 TargetLowering::DAGCombinerInfo &DCI,
30862 const X86Subtarget &Subtarget) {
30863 EVT VT = N->getValueType(0);
30864 if (DCI.isBeforeLegalize() && VT.isVector())
30865 return reduceVMULWidth(N, DAG, Subtarget);
30867 // An imul is usually smaller than the alternative sequence.
30868 if (DAG.getMachineFunction().getFunction()->optForMinSize())
30871 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
30874 if (VT != MVT::i64 && VT != MVT::i32)
30877 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
30880 uint64_t MulAmt = C->getZExtValue();
30881 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
30884 uint64_t MulAmt1 = 0;
30885 uint64_t MulAmt2 = 0;
30886 if ((MulAmt % 9) == 0) {
30888 MulAmt2 = MulAmt / 9;
30889 } else if ((MulAmt % 5) == 0) {
30891 MulAmt2 = MulAmt / 5;
30892 } else if ((MulAmt % 3) == 0) {
30894 MulAmt2 = MulAmt / 3;
30900 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
30902 if (isPowerOf2_64(MulAmt2) &&
30903 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
30904 // If the second multiplier is pow2, issue it first. We want the multiply by
30905 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
30906 // is an add.
30907 std::swap(MulAmt1, MulAmt2);
30909 if (isPowerOf2_64(MulAmt1))
30910 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
30911 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
30913 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
30914 DAG.getConstant(MulAmt1, DL, VT));
30916 if (isPowerOf2_64(MulAmt2))
30917 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
30918 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
30920 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
30921 DAG.getConstant(MulAmt2, DL, VT));
30925 assert(MulAmt != 0 &&
30926 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
30927 "Both cases that could cause potential overflows should have "
30928 "already been handled.");
30929 int64_t SignMulAmt = C->getSExtValue();
30930 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
30931 (SignMulAmt != -INT64_MAX)) {
30932 int NumSign = SignMulAmt > 0 ? 1 : -1;
30933 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
30934 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
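// For example (constant chosen for illustration): SignMulAmt == -15 gives
// NumSign == -1 and 15 + 1 == 16 == 2^4, so the 2^N - 1 path below applies
// and the negated result is rebuilt as (sub 0, (sub (shl x, 4), x)).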
30935 if (IsPowerOf2_64PlusOne) {
30936 // (mul x, 2^N + 1) => (add (shl x, N), x)
30937 NewMul = DAG.getNode(
30938 ISD::ADD, DL, VT, N->getOperand(0),
30939 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
30940 DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
30942 } else if (IsPowerOf2_64MinusOne) {
30943 // (mul x, 2^N - 1) => (sub (shl x, N), x)
30944 NewMul = DAG.getNode(
30946 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
30947 DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
30951 // To negate, subtract the number from zero
30952 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
30954 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
30959 // Do not add new nodes to DAG combiner worklist.
30960 DCI.CombineTo(N, NewMul, false);
30965 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
30966 SDValue N0 = N->getOperand(0);
30967 SDValue N1 = N->getOperand(1);
30968 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
30969 EVT VT = N0.getValueType();
30971 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
30972 // since the result of setcc_c is all zero's or all ones.
30973 if (VT.isInteger() && !VT.isVector() &&
30974 N1C && N0.getOpcode() == ISD::AND &&
30975 N0.getOperand(1).getOpcode() == ISD::Constant) {
30976 SDValue N00 = N0.getOperand(0);
30977 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
30978 Mask <<= N1C->getAPIntValue();
30979 bool MaskOK = false;
30980 // We can handle cases concerning bit-widening nodes containing setcc_c if
30981 // we carefully interrogate the mask to make sure we are semantics
30982 // preserving.
30983 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
30984 // of the underlying setcc_c operation if the setcc_c was zero extended.
30985 // Consider the following example:
30986 // zext(setcc_c) -> i32 0x0000FFFF
30987 // c1 -> i32 0x0000FFFF
30988 // c2 -> i32 0x00000001
30989 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
30990 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
30991 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30993 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
30994 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
30996 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
30997 N00.getOpcode() == ISD::ANY_EXTEND) &&
30998 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
30999 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
31001 if (MaskOK && Mask != 0) {
31003 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
31007 // Hardware support for vector shifts is sparse which makes us scalarize the
31008 // vector operations in many cases. Also, on sandybridge ADD is faster than
31009 // shl.
31010 // (shl V, 1) -> add V,V
31011 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31012 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31013 assert(N0.getValueType().isVector() && "Invalid vector shift type");
31014 // We shift all of the values by one. In many cases we do not have
31015 // hardware support for this operation. This is better expressed as an ADD
31016 // of two values.
31017 if (N1SplatC->getAPIntValue() == 1)
31018 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
31024 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
31025 SDValue N0 = N->getOperand(0);
31026 SDValue N1 = N->getOperand(1);
31027 EVT VT = N0.getValueType();
31028 unsigned Size = VT.getSizeInBits();
31030 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
31031 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
31032 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
31033 // depending on sign of (SarConst - [56,48,32,24,16])
31035 // sexts in X86 are MOVs. The MOVs have the same code size
31036 // as the SHIFTs above (only a SHIFT by 1 has smaller code size).
31037 // However, the MOVs have 2 advantages over a SHIFT:
31038 // 1. MOVs can write to a register that differs from the source.
31039 // 2. MOVs accept memory operands.
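// Worked example (illustrative i32 constants): (sra (shl x, 24), 26) becomes
// (sra (sext_in_reg x, i8), 2), because SarConst(26) minus (Size - ShiftSize)(24)
// leaves a non-negative shift of 2.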
31041 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31042 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
31043 N0.getOperand(1).getOpcode() != ISD::Constant)
31046 SDValue N00 = N0.getOperand(0);
31047 SDValue N01 = N0.getOperand(1);
31048 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31049 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31050 EVT CVT = N1.getValueType();
31052 if (SarConst.isNegative())
31055 for (MVT SVT : MVT::integer_valuetypes()) {
31056 unsigned ShiftSize = SVT.getSizeInBits();
31057 // Skip types without a corresponding sext/zext and ShlConst values that
31058 // are not one of [56,48,32,24,16].
31059 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
31063 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
31064 SarConst = SarConst - (Size - ShiftSize);
31067 else if (SarConst.isNegative())
31068 return DAG.getNode(ISD::SHL, DL, VT, NN,
31069 DAG.getConstant(-SarConst, DL, CVT));
31071 return DAG.getNode(ISD::SRA, DL, VT, NN,
31072 DAG.getConstant(SarConst, DL, CVT));
31077 /// \brief Returns a vector of 0s if the node in input is a vector logical
31078 /// shift by a constant amount which is known to be bigger than or equal
31079 /// to the vector element size in bits.
31080 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31081 const X86Subtarget &Subtarget) {
31082 EVT VT = N->getValueType(0);
31084 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31085 (!Subtarget.hasInt256() ||
31086 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
31089 SDValue Amt = N->getOperand(1);
31091 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31092 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31093 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31094 unsigned MaxAmount =
31095 VT.getSimpleVT().getScalarSizeInBits();
31097 // SSE2/AVX2 logical shifts always return a vector of 0s
31098 // if the shift amount is bigger than or equal to
31099 // the element size. The constant shift amount will be
31100 // encoded as an 8-bit immediate.
31101 if (ShiftAmt.trunc(8).uge(MaxAmount))
31102 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
31108 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31109 TargetLowering::DAGCombinerInfo &DCI,
31110 const X86Subtarget &Subtarget) {
31111 if (N->getOpcode() == ISD::SHL)
31112 if (SDValue V = combineShiftLeft(N, DAG))
31115 if (N->getOpcode() == ISD::SRA)
31116 if (SDValue V = combineShiftRightAlgebraic(N, DAG))
31119 // Try to fold this logical shift into a zero vector.
31120 if (N->getOpcode() != ISD::SRA)
31121 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
31127 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31128 TargetLowering::DAGCombinerInfo &DCI,
31129 const X86Subtarget &Subtarget) {
31130 unsigned Opcode = N->getOpcode();
31131 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31132 X86ISD::VSRLI == Opcode) &&
31133 "Unexpected shift opcode");
31134 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31135 EVT VT = N->getValueType(0);
31136 SDValue N0 = N->getOperand(0);
31137 SDValue N1 = N->getOperand(1);
31138 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31139 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31140 "Unexpected value type");
31142 // Out of range logical bit shifts are guaranteed to be zero.
31143 // Out of range arithmetic bit shifts splat the sign bit.
31144 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
31145 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
31147 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31149 ShiftVal = NumBitsPerElt - 1;
31152 // Shift N0 by zero -> N0.
31156 // Shift zero -> zero.
31157 if (ISD::isBuildVectorAllZeros(N0.getNode()))
31158 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31160 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31161 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31162 // TODO - support other sra opcodes as needed.
31163 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31164 N0.getOpcode() == X86ISD::VSRAI)
31165 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31167 // We can decode 'whole byte' logical bit shifts as shuffles.
31168 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
31170 SmallVector<int, 1> NonceMask; // Just a placeholder.
31171 NonceMask.push_back(0);
31172 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31173 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31175 return SDValue(); // This routine will use CombineTo to replace N.
31178 // Constant Folding.
31180 SmallVector<APInt, 32> EltBits;
31181 if (N->isOnlyUserOf(N0.getNode()) &&
31182 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31183 assert(EltBits.size() == VT.getVectorNumElements() &&
31184 "Unexpected shift value type");
31185 unsigned ShiftImm = ShiftVal.getZExtValue();
31186 for (APInt &Elt : EltBits) {
31187 if (X86ISD::VSHLI == Opcode)
31189 else if (X86ISD::VSRAI == Opcode)
31190 Elt.ashrInPlace(ShiftImm);
31192 Elt.lshrInPlace(ShiftImm);
31194 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
31200 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31201 TargetLowering::DAGCombinerInfo &DCI,
31202 const X86Subtarget &Subtarget) {
31204 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31205 (N->getOpcode() == X86ISD::PINSRW &&
31206 N->getValueType(0) == MVT::v8i16)) &&
31207 "Unexpected vector insertion");
31209 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
31211 SmallVector<int, 1> NonceMask; // Just a placeholder.
31212 NonceMask.push_back(0);
31213 combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31214 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31219 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31220 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31221 /// OR -> CMPNEQSS.
31222 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31223 TargetLowering::DAGCombinerInfo &DCI,
31224 const X86Subtarget &Subtarget) {
31227 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31228 // we're requiring SSE2 for both.
31229 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31230 SDValue N0 = N->getOperand(0);
31231 SDValue N1 = N->getOperand(1);
31232 SDValue CMP0 = N0->getOperand(1);
31233 SDValue CMP1 = N1->getOperand(1);
31236 // The SETCCs should both refer to the same CMP.
31237 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
31240 SDValue CMP00 = CMP0->getOperand(0);
31241 SDValue CMP01 = CMP0->getOperand(1);
31242 EVT VT = CMP00.getValueType();
31244 if (VT == MVT::f32 || VT == MVT::f64) {
31245 bool ExpectingFlags = false;
31246 // Check for any users that want flags:
31247 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31248 !ExpectingFlags && UI != UE; ++UI)
31249 switch (UI->getOpcode()) {
31254 ExpectingFlags = true;
31256 case ISD::CopyToReg:
31257 case ISD::SIGN_EXTEND:
31258 case ISD::ZERO_EXTEND:
31259 case ISD::ANY_EXTEND:
31263 if (!ExpectingFlags) {
31264 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31265 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31267 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
31268 X86::CondCode tmp = cc0;
31273 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
31274 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31275 // FIXME: need symbolic constants for these magic numbers.
31276 // See X86ATTInstPrinter.cpp:printSSECC().
31277 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31278 if (Subtarget.hasAVX512()) {
31279 SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
31281 DAG.getConstant(x86cc, DL, MVT::i8));
31282 if (N->getValueType(0) != MVT::i1)
31283 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
31287 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31288 CMP00.getValueType(), CMP00, CMP01,
31289 DAG.getConstant(x86cc, DL,
31292 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31293 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31295 if (is64BitFP && !Subtarget.is64Bit()) {
31296 // On a 32-bit target, we cannot bitcast the 64-bit float to a
31297 // 64-bit integer, since that's not a legal type. Since
31298 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
31299 // bits, but can do this little dance to extract the lowest 32 bits
31300 // and work with those going forward.
31301 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
31303 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31304 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
31305 Vector32, DAG.getIntPtrConstant(0, DL));
31309 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31310 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31311 DAG.getConstant(1, DL, IntVT));
31312 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
31314 return OneBitOfTruth;
31322 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31323 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31324 assert(N->getOpcode() == ISD::AND);
31326 EVT VT = N->getValueType(0);
31327 SDValue N0 = N->getOperand(0);
31328 SDValue N1 = N->getOperand(1);
31331 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
31334 if (N0.getOpcode() == ISD::XOR &&
31335 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
31336 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
31338 if (N1.getOpcode() == ISD::XOR &&
31339 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
31340 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
31345 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
31346 // register. In most cases we actually compare or select YMM-sized registers
31347 // and mixing the two types creates horrible code. This method optimizes
31348 // some of the transition sequences.
31349 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31350 TargetLowering::DAGCombinerInfo &DCI,
31351 const X86Subtarget &Subtarget) {
31352 EVT VT = N->getValueType(0);
31353 if (!VT.is256BitVector())
31356 assert((N->getOpcode() == ISD::ANY_EXTEND ||
31357 N->getOpcode() == ISD::ZERO_EXTEND ||
31358 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31360 SDValue Narrow = N->getOperand(0);
31361 EVT NarrowVT = Narrow->getValueType(0);
31362 if (!NarrowVT.is128BitVector())
31365 if (Narrow->getOpcode() != ISD::XOR &&
31366 Narrow->getOpcode() != ISD::AND &&
31367 Narrow->getOpcode() != ISD::OR)
31370 SDValue N0 = Narrow->getOperand(0);
31371 SDValue N1 = Narrow->getOperand(1);
31374 // The Left side has to be a trunc.
31375 if (N0.getOpcode() != ISD::TRUNCATE)
31378 // The type of the truncated inputs.
31379 EVT WideVT = N0->getOperand(0)->getValueType(0);
31383 // The right side has to be a 'trunc' or a constant vector.
31384 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
31385 ConstantSDNode *RHSConstSplat = nullptr;
31386 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
31387 RHSConstSplat = RHSBV->getConstantSplatNode();
31388 if (!RHSTrunc && !RHSConstSplat)
31391 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31393 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
31396 // Set N0 and N1 to hold the inputs to the new wide operation.
31397 N0 = N0->getOperand(0);
31398 if (RHSConstSplat) {
31399 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
31400 SDValue(RHSConstSplat, 0));
31401 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
31402 } else if (RHSTrunc) {
31403 N1 = N1->getOperand(0);
31406 // Generate the wide operation.
31407 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
31408 unsigned Opcode = N->getOpcode();
31410 case ISD::ANY_EXTEND:
31412 case ISD::ZERO_EXTEND: {
31413 unsigned InBits = NarrowVT.getScalarSizeInBits();
31414 APInt Mask = APInt::getAllOnesValue(InBits);
31415 Mask = Mask.zext(VT.getScalarSizeInBits());
31416 return DAG.getNode(ISD::AND, DL, VT,
31417 Op, DAG.getConstant(Mask, DL, VT));
31419 case ISD::SIGN_EXTEND:
31420 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
31421 Op, DAG.getValueType(NarrowVT));
31423 llvm_unreachable("Unexpected opcode");
31427 /// If both input operands of a logic op are being cast from floating point
31428 /// types, try to convert this into a floating point logic node to avoid
31429 /// unnecessary moves from SSE to integer registers.
31430 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
31431 const X86Subtarget &Subtarget) {
31432 unsigned FPOpcode = ISD::DELETED_NODE;
31433 if (N->getOpcode() == ISD::AND)
31434 FPOpcode = X86ISD::FAND;
31435 else if (N->getOpcode() == ISD::OR)
31436 FPOpcode = X86ISD::FOR;
31437 else if (N->getOpcode() == ISD::XOR)
31438 FPOpcode = X86ISD::FXOR;
31440 assert(FPOpcode != ISD::DELETED_NODE &&
31441 "Unexpected input node for FP logic conversion");
31443 EVT VT = N->getValueType(0);
31444 SDValue N0 = N->getOperand(0);
31445 SDValue N1 = N->getOperand(1);
31447 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
31448 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
31449 (Subtarget.hasSSE2() && VT == MVT::i64))) {
31450 SDValue N00 = N0.getOperand(0);
31451 SDValue N10 = N1.getOperand(0);
31452 EVT N00Type = N00.getValueType();
31453 EVT N10Type = N10.getValueType();
31454 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
31455 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
31456 return DAG.getBitcast(VT, FPLogic);
31462 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
31463 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
31464 /// with a shift-right to eliminate loading the vector constant mask value.
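/// For example (illustrative types): if every i32 element of the other operand
/// is known to be all-ones or all-zeros (32 sign bits) and the splat mask is 1,
/// the 'and' is replaced by a logical right shift by 31, avoiding a load of the
/// constant mask.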
31465 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
31466 const X86Subtarget &Subtarget) {
31467 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
31468 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
31469 EVT VT0 = Op0.getValueType();
31470 EVT VT1 = Op1.getValueType();
31472 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
31476 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
31477 !SplatVal.isMask())
31480 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
31483 unsigned EltBitWidth = VT0.getScalarSizeInBits();
31484 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
31488 unsigned ShiftVal = SplatVal.countTrailingOnes();
31489 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
31490 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
31491 return DAG.getBitcast(N->getValueType(0), Shift);
31494 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
31495 TargetLowering::DAGCombinerInfo &DCI,
31496 const X86Subtarget &Subtarget) {
31497 if (DCI.isBeforeLegalizeOps())
31500 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
31503 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
31506 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
31509 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
31512 EVT VT = N->getValueType(0);
31513 SDValue N0 = N->getOperand(0);
31514 SDValue N1 = N->getOperand(1);
31517 // Attempt to recursively combine a bitmask AND with shuffles.
31518 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
31520 SmallVector<int, 1> NonceMask; // Just a placeholder.
31521 NonceMask.push_back(0);
31522 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31523 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31525 return SDValue(); // This routine will use CombineTo to replace N.
31528 // Create BEXTR instructions
31529 // BEXTR is ((X >> imm) & (2**size-1))
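// For example (illustrative constants): (and (srl x, 4), 0xFFF) extracts 12
// bits starting at bit 4, and the control immediate built below is
// 4 | (12 << 8) = 0x0C04.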
31530 if (VT != MVT::i32 && VT != MVT::i64)
31533 if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
31535 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
31538 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
31539 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
31540 if (MaskNode && ShiftNode) {
31541 uint64_t Mask = MaskNode->getZExtValue();
31542 uint64_t Shift = ShiftNode->getZExtValue();
31543 if (isMask_64(Mask)) {
31544 uint64_t MaskSize = countPopulation(Mask);
31545 if (Shift + MaskSize <= VT.getSizeInBits())
31546 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
31547 DAG.getConstant(Shift | (MaskSize << 8), DL,
31554 // Try to fold:
31555 // (or (and (m, y), (pandn m, x)))
31556 // into:
31557 // (vselect m, x, y)
31558 // As a special case, try to fold:
31559 // (or (and (m, (sub 0, x)), (pandn m, x)))
31560 // into:
31561 // (sub (xor X, M), M)
31562 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
31563 const X86Subtarget &Subtarget) {
31564 assert(N->getOpcode() == ISD::OR);
31566 SDValue N0 = N->getOperand(0);
31567 SDValue N1 = N->getOperand(1);
31568 EVT VT = N->getValueType(0);
31570 if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
31572 assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
31574 // Canonicalize pandn to RHS
31575 if (N0.getOpcode() == X86ISD::ANDNP)
31578 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
31581 SDValue Mask = N1.getOperand(0);
31582 SDValue X = N1.getOperand(1);
31584 if (N0.getOperand(0) == Mask)
31585 Y = N0.getOperand(1);
31586 if (N0.getOperand(1) == Mask)
31587 Y = N0.getOperand(0);
31589 // Check to see if the mask appeared in both the AND and ANDNP.
31593 // Validate that X, Y, and Mask are bitcasts, and see through them.
31594 Mask = peekThroughBitcasts(Mask);
31595 X = peekThroughBitcasts(X);
31596 Y = peekThroughBitcasts(Y);
31598 EVT MaskVT = Mask.getValueType();
31600 // Validate that the Mask operand is a vector sra node.
31601 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
31602 // there is no psrai.b
31603 unsigned EltBits = MaskVT.getScalarSizeInBits();
31604 unsigned SraAmt = ~0;
31605 if (Mask.getOpcode() == ISD::SRA) {
31606 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
31607 if (auto *AmtConst = AmtBV->getConstantSplatNode())
31608 SraAmt = AmtConst->getZExtValue();
31609 } else if (Mask.getOpcode() == X86ISD::VSRAI)
31610 SraAmt = Mask.getConstantOperandVal(1);
31612 if ((SraAmt + 1) != EltBits)
31618 // (or (and (M, (sub 0, X)), (pandn M, X)))
31619 // which is a special case of vselect:
31620 // (vselect M, (sub 0, X), X)
31622 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
31623 // We know that, if fNegate is 0 or 1:
31624 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
31626 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
31627 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
31628 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
31629 // This lets us transform our vselect to:
31630 // (add (xor X, M), (and M, 1))
31632 // (sub (xor X, M), M)
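// A quick single-lane check of the identity (illustrative): if M == -1 then
// (X ^ M) - M == ~X + 1 == -X, and if M == 0 then (X ^ 0) - 0 == X.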
31633 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
31634 auto IsNegV = [](SDNode *N, SDValue V) {
31635 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
31636 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
31639 if (IsNegV(Y.getNode(), X))
31641 else if (IsNegV(X.getNode(), Y))
31645 if (EltBits != 8 && EltBits != 16 && EltBits != 32)
31648 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
31649 SDValue SubOp2 = Mask;
31651 // If the negate was on the false side of the select, then
31652 // the operands of the SUB need to be swapped. PR 27251.
31653 // This is because the pattern being matched above is
31654 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
31655 // but if the pattern matched was
31656 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
31657 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
31658 // pattern also needs to be a negation of the replacement pattern above.
31659 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
31660 // sub accomplishes the negation of the replacement pattern.
31662 std::swap(SubOp1, SubOp2);
31664 return DAG.getBitcast(VT,
31665 DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
31669 // PBLENDVB is only available on SSE 4.1.
31670 if (!Subtarget.hasSSE41())
31673 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
31675 X = DAG.getBitcast(BlendVT, X);
31676 Y = DAG.getBitcast(BlendVT, Y);
31677 Mask = DAG.getBitcast(BlendVT, Mask);
31678 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
31679 return DAG.getBitcast(VT, Mask);
31682 // Helper function for combineOrCmpEqZeroToCtlzSrl
31686 // srl(ctlz x), log2(bitsize(x))
31687 // Input pattern is checked by caller.
31688 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
31689 SelectionDAG &DAG) {
31690 SDValue Cmp = Op.getOperand(1);
31691 EVT VT = Cmp.getOperand(0).getValueType();
31692 unsigned Log2b = Log2_32(VT.getSizeInBits());
31694 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
31695 // The result of the shift is true or false, and on X86, the 32-bit
31696 // encoding of shr and lzcnt is more desirable.
31697 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
31698 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
31699 DAG.getConstant(Log2b, dl, VT));
31700 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
31703 // Try to transform:
31704 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
31705 // into:
31706 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
31707 // Will also attempt to match more generic cases, eg:
31708 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
31709 // Only applies if the target supports the FastLZCNT feature.
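// The reasoning, for a 32-bit example: ctlz(x) == 32 exactly when x == 0, so
// bit 5 of ctlz(x) is set only in that case and srl(ctlz(x), 5) produces the
// desired 0/1 value of (x == 0).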
31710 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
31711 TargetLowering::DAGCombinerInfo &DCI,
31712 const X86Subtarget &Subtarget) {
31713 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
31716 auto isORCandidate = [](SDValue N) {
31717 return (N->getOpcode() == ISD::OR && N->hasOneUse());
31720 // Check the zero extend is extending to 32-bit or more. The code generated by
31721 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
31722 // instructions to clear the upper bits.
31723 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
31724 !isORCandidate(N->getOperand(0)))
31727 // Check the node matches: setcc(eq, cmp 0)
31728 auto isSetCCCandidate = [](SDValue N) {
31729 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
31730 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
31731 N->getOperand(1).getOpcode() == X86ISD::CMP &&
31732 isNullConstant(N->getOperand(1).getOperand(1)) &&
31733 N->getOperand(1).getValueType().bitsGE(MVT::i32);
31736 SDNode *OR = N->getOperand(0).getNode();
31737 SDValue LHS = OR->getOperand(0);
31738 SDValue RHS = OR->getOperand(1);
31740 // Save nodes matching or(or, setcc(eq, cmp 0)).
31741 SmallVector<SDNode *, 2> ORNodes;
31742 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
31743 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
31744 ORNodes.push_back(OR);
31745 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
31746 LHS = OR->getOperand(0);
31747 RHS = OR->getOperand(1);
31750 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
31751 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
31752 !isORCandidate(SDValue(OR, 0)))
31755 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
31757 // or(srl(ctlz),srl(ctlz)).
31758 // The dag combiner can then fold it into:
31759 // srl(or(ctlz, ctlz)).
31760 EVT VT = OR->getValueType(0);
31761 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
31762 SDValue Ret, NewRHS;
31763 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
31764 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
31769 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
31770 while (ORNodes.size() > 0) {
31771 OR = ORNodes.pop_back_val();
31772 LHS = OR->getOperand(0);
31773 RHS = OR->getOperand(1);
31774 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
31775 if (RHS->getOpcode() == ISD::OR)
31776 std::swap(LHS, RHS);
31777 EVT VT = OR->getValueType(0);
31778 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
31781 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
31785 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
31790 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
31791 TargetLowering::DAGCombinerInfo &DCI,
31792 const X86Subtarget &Subtarget) {
31793 if (DCI.isBeforeLegalizeOps())
31796 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
31799 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
31802 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
31805 SDValue N0 = N->getOperand(0);
31806 SDValue N1 = N->getOperand(1);
31807 EVT VT = N->getValueType(0);
31809 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
31812 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
31813 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
31815 // SHLD/SHRD instructions have lower register pressure, but on some
31816 // platforms they have higher latency than the equivalent
31817 // series of shifts/or that would otherwise be generated.
31818 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
31819 // have higher latencies and we are not optimizing for size.
31820 if (!OptForSize && Subtarget.isSHLDSlow())
31823 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
31825 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
31827 if (!N0.hasOneUse() || !N1.hasOneUse())
31830 SDValue ShAmt0 = N0.getOperand(1);
31831 if (ShAmt0.getValueType() != MVT::i8)
31833 SDValue ShAmt1 = N1.getOperand(1);
31834 if (ShAmt1.getValueType() != MVT::i8)
31836 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
31837 ShAmt0 = ShAmt0.getOperand(0);
31838 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
31839 ShAmt1 = ShAmt1.getOperand(0);
31842 unsigned Opc = X86ISD::SHLD;
31843 SDValue Op0 = N0.getOperand(0);
31844 SDValue Op1 = N1.getOperand(0);
31845 if (ShAmt0.getOpcode() == ISD::SUB ||
31846 ShAmt0.getOpcode() == ISD::XOR) {
31847 Opc = X86ISD::SHRD;
31848 std::swap(Op0, Op1);
31849 std::swap(ShAmt0, ShAmt1);
31852 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
31853 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
31854 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
31855 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
31856 unsigned Bits = VT.getSizeInBits();
31857 if (ShAmt1.getOpcode() == ISD::SUB) {
31858 SDValue Sum = ShAmt1.getOperand(0);
31859 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
31860 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
31861 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
31862 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
31863 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
31864 return DAG.getNode(Opc, DL, VT,
31866 DAG.getNode(ISD::TRUNCATE, DL,
31869 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
31870 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
31871 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
31872 return DAG.getNode(Opc, DL, VT,
31873 N0.getOperand(0), N1.getOperand(0),
31874 DAG.getNode(ISD::TRUNCATE, DL,
31876 } else if (ShAmt1.getOpcode() == ISD::XOR) {
31877 SDValue Mask = ShAmt1.getOperand(1);
31878 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
31879 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
31880 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
31881 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
31882 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
31883 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
31884 if (Op1.getOpcode() == InnerShift &&
31885 isa<ConstantSDNode>(Op1.getOperand(1)) &&
31886 Op1.getConstantOperandVal(1) == 1) {
31887 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
31888 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
31890 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
31891 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
31892 Op1.getOperand(0) == Op1.getOperand(1)) {
31893 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
31894 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
31903 /// Generate NEG and CMOV for integer abs.
31904 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
31905 EVT VT = N->getValueType(0);
31907 // Since X86 does not have CMOV for 8-bit integer, we don't convert
31908 // 8-bit integer abs to NEG and CMOV.
31909 if (VT.isInteger() && VT.getSizeInBits() == 8)
31912 SDValue N0 = N->getOperand(0);
31913 SDValue N1 = N->getOperand(1);
31916 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
31917 // and change it to SUB and CMOV.
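// Why this computes abs (illustrative reasoning): Y = sra(X, size(X)-1) is 0
// for non-negative X and -1 for negative X, so (X + Y) ^ Y yields X in the
// first case and ~(X - 1) == -X in the second.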
31918 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
31919 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
31920 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
31921 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
31922 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
31923 // Generate SUB & CMOV.
31924 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
31925 DAG.getConstant(0, DL, VT), N0.getOperand(0));
31926 SDValue Ops[] = {N0.getOperand(0), Neg,
31927 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
31928 SDValue(Neg.getNode(), 1)};
31929 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
31935 /// Try to turn tests against the signbit in the form of:
31936 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
31937 /// into:
31938 /// SETGT(X, -1)
31939 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
31940 // This is only worth doing if the output type is i8 or i1.
31941 EVT ResultType = N->getValueType(0);
31942 if (ResultType != MVT::i8 && ResultType != MVT::i1)
31945 SDValue N0 = N->getOperand(0);
31946 SDValue N1 = N->getOperand(1);
31948 // We should be performing an xor against a truncated shift.
31949 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
31952 // Make sure we are performing an xor against one.
31953 if (!isOneConstant(N1))
31956 // SetCC on x86 zero extends so only act on this if it's a logical shift.
31957 SDValue Shift = N0.getOperand(0);
31958 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
31961 // Make sure we are truncating from one of i16, i32 or i64.
31962 EVT ShiftTy = Shift.getValueType();
31963 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
31966 // Make sure the shift amount extracts the sign bit.
31967 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
31968 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
31971 // Create a greater-than comparison against -1.
31972 // N.B. Using SETGE against 0 works but we want a canonical looking
31973 // comparison; using SETGT matches up with what TranslateX86CC does.
31975 SDValue ShiftOp = Shift.getOperand(0);
31976 EVT ShiftOpTy = ShiftOp.getValueType();
31977 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31978 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
31979 *DAG.getContext(), ResultType);
31980 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
31981 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
31982 if (SetCCResultType != ResultType)
31983 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
31987 /// Turn vector tests of the signbit in the form of:
31988 /// xor (sra X, elt_size(X)-1), -1
31989 /// into:
31990 /// pcmpgt X, -1
31992 /// This should be called before type legalization because the pattern may not
31993 /// persist after that.
31994 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
31995 const X86Subtarget &Subtarget) {
31996 EVT VT = N->getValueType(0);
31997 if (!VT.isSimple())
32000 switch (VT.getSimpleVT().SimpleTy) {
32001 default: return SDValue();
32004 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32005 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
32009 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
32012 // There must be a shift right algebraic before the xor, and the xor must be a
32013 // 'not' operation.
32014 SDValue Shift = N->getOperand(0);
32015 SDValue Ones = N->getOperand(1);
32016 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
32017 !ISD::isBuildVectorAllOnes(Ones.getNode()))
32020 // The shift should be smearing the sign bit across each vector element.
32021 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
32025 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32026 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
32027 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
32030 // Create a greater-than comparison against -1. We don't use the more obvious
32031 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
32032 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
32035 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
32036 /// is valid for the given \p Subtarget.
32037 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32038 const X86Subtarget &Subtarget) {
32039 if (!Subtarget.hasAVX512())
32042 // FIXME: Scalar type may be supported if we move it to vector register.
32043 if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
32046 EVT SrcElVT = SrcVT.getScalarType();
32047 EVT DstElVT = DstVT.getScalarType();
32048 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
32050 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
32052 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
32053 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
32057 /// Detect a pattern of truncation with saturation:
32058 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32059 /// Return the source value to be truncated or SDValue() if the pattern was not
32060 /// matched.
32061 static SDValue detectUSatPattern(SDValue In, EVT VT) {
32062 if (In.getOpcode() != ISD::UMIN)
32065 // Saturation with truncation. We truncate from InVT to VT.
32066 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32067 "Unexpected types for truncate operation");
32070 if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
32071 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
32072 // to the element size of the destination type.
32073 return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
32079 /// Detect a pattern of truncation with saturation:
32080 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32081 /// The types should allow to use VPMOVUS* instruction on AVX512.
32082 /// Return the source value to be truncated or SDValue() if the pattern was not
32083 /// matched.
32084 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32085 const X86Subtarget &Subtarget) {
32086 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32088 return detectUSatPattern(In, VT);
32092 combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32093 const X86Subtarget &Subtarget) {
32094 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32095 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
32097 if (auto USatVal = detectUSatPattern(In, VT))
32098 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32099 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
32103 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
32104 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
32105 /// X86ISD::AVG instruction.
32106 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
32107 const X86Subtarget &Subtarget,
32109 if (!VT.isVector() || !VT.isSimple())
32111 EVT InVT = In.getValueType();
32112 unsigned NumElems = VT.getVectorNumElements();
32114 EVT ScalarVT = VT.getVectorElementType();
32115 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
32116 isPowerOf2_32(NumElems)))
32119 // InScalarVT is the intermediate type in the AVG pattern and should be wider
32120 // than the original input type (i8/i16).
32121 EVT InScalarVT = InVT.getVectorElementType();
32122 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
32125 if (!Subtarget.hasSSE2())
32127 if (Subtarget.hasBWI()) {
32128 if (VT.getSizeInBits() > 512)
32130 } else if (Subtarget.hasAVX2()) {
32131 if (VT.getSizeInBits() > 256)
32134 if (VT.getSizeInBits() > 128)
32138 // Detect the following pattern:
32140 // %1 = zext <N x i8> %a to <N x i32>
32141 // %2 = zext <N x i8> %b to <N x i32>
32142 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
32143 // %4 = add nuw nsw <N x i32> %3, %2
32144 // %5 = lshr <N x i32> %4, <i32 1 x N>
32145 // %6 = trunc <N x i32> %5 to <N x i8>
32147 // In AVX512, the last instruction can also be a trunc store.
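// Note: pavgb/pavgw compute the unsigned rounded average (a + b + 1) >> 1
// directly, which is why the widened zext/add/lshr/trunc chain above can
// collapse into a single X86ISD::AVG node.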
32149 if (In.getOpcode() != ISD::SRL)
32152 // A lambda checking the given SDValue is a constant vector and each element
32153 // is in the range [Min, Max].
32154 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
32155 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
32156 if (!BV || !BV->isConstant())
32158 for (SDValue Op : V->ops()) {
32159 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
32162 uint64_t Val = C->getZExtValue();
32163 if (Val < Min || Val > Max)
32169 // Check if each element of the vector is left-shifted by one.
32170 auto LHS = In.getOperand(0);
32171 auto RHS = In.getOperand(1);
32172 if (!IsConstVectorInRange(RHS, 1, 1))
32174 if (LHS.getOpcode() != ISD::ADD)
32177 // Detect a pattern of a + b + 1 where the order doesn't matter.
32178 SDValue Operands[3];
32179 Operands[0] = LHS.getOperand(0);
32180 Operands[1] = LHS.getOperand(1);
32182 // Take care of the case when one of the operands is a constant vector whose
32183 // element is in the range [1, 256].
32184 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
32185 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
32186 Operands[0].getOperand(0).getValueType() == VT) {
32187 // The pattern is detected. Subtract one from the constant vector, then
32188 // demote it and emit X86ISD::AVG instruction.
32189 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
32190 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
32191 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
32192 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32196 if (Operands[0].getOpcode() == ISD::ADD)
32197 std::swap(Operands[0], Operands[1]);
32198 else if (Operands[1].getOpcode() != ISD::ADD)
32200 Operands[2] = Operands[1].getOperand(0);
32201 Operands[1] = Operands[1].getOperand(1);
32203 // Now we have three operands of two additions. Check that one of them is a
32204 // constant vector with ones, and the other two are promoted from i8/i16.
32205 for (int i = 0; i < 3; ++i) {
32206 if (!IsConstVectorInRange(Operands[i], 1, 1))
32208 std::swap(Operands[i], Operands[2]);
32210 // Check if Operands[0] and Operands[1] are results of type promotion.
32211 for (int j = 0; j < 2; ++j)
32212 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
32213 Operands[j].getOperand(0).getValueType() != VT)
32216 // The pattern is detected, emit X86ISD::AVG instruction.
32217 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32218 Operands[1].getOperand(0));
32224 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
32225 TargetLowering::DAGCombinerInfo &DCI,
32226 const X86Subtarget &Subtarget) {
32227 LoadSDNode *Ld = cast<LoadSDNode>(N);
32228 EVT RegVT = Ld->getValueType(0);
32229 EVT MemVT = Ld->getMemoryVT();
32231 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32233 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
32234 // into two 16-byte operations.
32235 ISD::LoadExtType Ext = Ld->getExtensionType();
32237 unsigned AddressSpace = Ld->getAddressSpace();
32238 unsigned Alignment = Ld->getAlignment();
32239 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
32240 Ext == ISD::NON_EXTLOAD &&
32241 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32242 AddressSpace, Alignment, &Fast) && !Fast) {
32243 unsigned NumElems = RegVT.getVectorNumElements();
32247 SDValue Ptr = Ld->getBasePtr();
32249 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
32252 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32253 Alignment, Ld->getMemOperand()->getFlags());
32255 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
32257 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32258 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
32259 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32261 Load2.getValue(1));
32263 SDValue NewVec = DAG.getUNDEF(RegVT);
32264 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
32265 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
32266 return DCI.CombineTo(N, NewVec, TF, true);
32272 /// If V is a build vector of boolean constants and exactly one of those
32273 /// constants is true, return the operand index of that true element.
32274 /// Otherwise, return -1.
32275 static int getOneTrueElt(SDValue V) {
32276 // This needs to be a build vector of booleans.
32277 // TODO: Checking for the i1 type matches the IR definition for the mask,
32278 // but the mask check could be loosened to i8 or other types. That might
32279 // also require checking more than 'allOnesValue'; eg, the x86 HW
32280 // instructions only require that the MSB is set for each mask element.
32281 // The ISD::MSTORE comments/definition do not specify how the mask operand
32283 auto *BV = dyn_cast<BuildVectorSDNode>(V);
32284 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
32287 int TrueIndex = -1;
32288 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
32289 for (unsigned i = 0; i < NumElts; ++i) {
32290 const SDValue &Op = BV->getOperand(i);
32293 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
32296 if (ConstNode->getAPIntValue().isAllOnesValue()) {
32297 // If we already found a one, this is too many.
32298 if (TrueIndex >= 0)
32306 /// Given a masked memory load/store operation, return true if it has one mask
32307 /// bit set. If it has one mask bit set, then also return the memory address of
32308 /// the scalar element to load/store, the vector index to insert/extract that
32309 /// scalar element, and the alignment for the scalar memory access.
32310 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
32311 SelectionDAG &DAG, SDValue &Addr,
32312 SDValue &Index, unsigned &Alignment) {
32313 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
32314 if (TrueMaskElt < 0)
32317 // Get the address of the one scalar element that is specified by the mask
32318 // using the appropriate offset from the base pointer.
32319 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
32320 Addr = MaskedOp->getBasePtr();
32321 if (TrueMaskElt != 0) {
32322 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
32323 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
32326 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
32327 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
32328 return true;
32329 }
32331 /// If exactly one element of the mask is set for a non-extending masked load,
32332 /// it is a scalar load and vector insert.
32333 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32334 /// mask have already been optimized in IR, so we don't bother with those here.
32335 static SDValue
32336 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32337 TargetLowering::DAGCombinerInfo &DCI) {
32338 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32339 // However, some target hooks may need to be added to know when the transform
32340 // is profitable. Endianness would also have to be considered.
32342 SDValue Addr, VecIndex;
32343 unsigned Alignment;
32344 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
32347 // Load the one scalar element that is specified by the mask using the
32348 // appropriate offset from the base pointer.
32349 SDLoc DL(ML);
32350 EVT VT = ML->getValueType(0);
32351 EVT EltVT = VT.getVectorElementType();
32352 SDValue Load =
32353 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
32354 Alignment, ML->getMemOperand()->getFlags());
32356 // Insert the loaded element into the appropriate place in the vector.
32357 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
32359 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
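// Convert a masked load with a constant mask into an unmasked vector load plus
// a select when it is known to be safe and faster, or into a masked load with
// an undef pass-through value plus a select; see the cases handled below.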
32362 static SDValue
32363 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32364 TargetLowering::DAGCombinerInfo &DCI) {
32365 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
32366 return SDValue();
32368 SDLoc DL(ML);
32369 EVT VT = ML->getValueType(0);
32371 // If we are loading the first and last elements of a vector, it is safe and
32372 // always faster to load the whole vector. Replace the masked load with a
32373 // vector load and select.
32374 unsigned NumElts = VT.getVectorNumElements();
32375 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
32376 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
32377 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
32378 if (LoadFirstElt && LoadLastElt) {
32379 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32380 ML->getMemOperand());
32381 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
32382 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
32385 // Convert a masked load with a constant mask into a masked load and a select.
32386 // This allows the select operation to use a faster kind of select instruction
32387 // (for example, vblendvps -> vblendps).
32389 // Don't try this if the pass-through operand is already undefined. That would
32390 // cause an infinite loop because that's what we're about to create.
32391 if (ML->getSrc0().isUndef())
32394 // The new masked load has an undef pass-through operand. The select uses the
32395 // original pass-through operand.
32396 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32397 ML->getMask(), DAG.getUNDEF(VT),
32398 ML->getMemoryVT(), ML->getMemOperand(),
32399 ML->getExtensionType());
32400 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
32402 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
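// Perform DAG combines on masked loads: reduce a single-active-lane load to a
// scalar load + insert, simplify constant masks, and widen sign-extending
// masked loads so the extension can be done with X86ISD::VSEXT.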
32405 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
32406 TargetLowering::DAGCombinerInfo &DCI,
32407 const X86Subtarget &Subtarget) {
32408 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
32410 // TODO: Expanding load with constant mask may be optimized as well.
32411 if (Mld->isExpandingLoad())
32414 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
32415 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
32417 // TODO: Do some AVX512 subsets benefit from this transform?
32418 if (!Subtarget.hasAVX512())
32419 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
32423 if (Mld->getExtensionType() != ISD::SEXTLOAD)
32426 // Resolve extending loads.
32427 EVT VT = Mld->getValueType(0);
32428 unsigned NumElems = VT.getVectorNumElements();
32429 EVT LdVT = Mld->getMemoryVT();
32430 SDLoc dl(Mld);
32432 assert(LdVT != VT && "Cannot extend to the same type");
32433 unsigned ToSz = VT.getScalarSizeInBits();
32434 unsigned FromSz = LdVT.getScalarSizeInBits();
32435 // From/To sizes and ElemCount must be pow of two.
32436 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
32437 "Unexpected size for extending masked load");
32439 unsigned SizeRatio = ToSz / FromSz;
32440 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
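// Illustrative example (shapes chosen for exposition): for a sign-extending
// masked load of v8i16 memory into a v8i32 result, SizeRatio is 2, so the code
// below builds a v16i16 masked load whose low eight lanes are then
// sign-extended with X86ISD::VSEXT.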
32442 // Create a type on which we perform the shuffle.
32443 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32444 LdVT.getScalarType(), NumElems*SizeRatio);
32445 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32447 // Convert Src0 value.
32448 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
32449 if (!Mld->getSrc0().isUndef()) {
32450 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32451 for (unsigned i = 0; i != NumElems; ++i)
32452 ShuffleVec[i] = i * SizeRatio;
32454 // Can't shuffle using an illegal type.
32455 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
32456 "WideVecVT should be legal");
32457 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
32458 DAG.getUNDEF(WideVecVT), ShuffleVec);
32460 // Prepare the new mask.
32461 SDValue NewMask;
32462 SDValue Mask = Mld->getMask();
32463 if (Mask.getValueType() == VT) {
32464 // Mask and original value have the same type.
32465 NewMask = DAG.getBitcast(WideVecVT, Mask);
32466 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32467 for (unsigned i = 0; i != NumElems; ++i)
32468 ShuffleVec[i] = i * SizeRatio;
32469 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
32470 ShuffleVec[i] = NumElems * SizeRatio;
32471 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
32472 DAG.getConstant(0, dl, WideVecVT),
32475 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
32476 unsigned WidenNumElts = NumElems*SizeRatio;
32477 unsigned MaskNumElts = VT.getVectorNumElements();
32478 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
32481 unsigned NumConcat = WidenNumElts / MaskNumElts;
32482 SmallVector<SDValue, 16> Ops(NumConcat);
32483 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
32485 for (unsigned i = 1; i != NumConcat; ++i)
32488 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
32491 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
32492 Mld->getBasePtr(), NewMask, WideSrc0,
32493 Mld->getMemoryVT(), Mld->getMemOperand(),
32494 ISD::NON_EXTLOAD);
32495 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
32496 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
32499 /// If exactly one element of the mask is set for a non-truncating masked store,
32500 /// it is a vector extract and scalar store.
32501 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32502 /// mask have already been optimized in IR, so we don't bother with those here.
32503 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
32504 SelectionDAG &DAG) {
32505 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32506 // However, some target hooks may need to be added to know when the transform
32507 // is profitable. Endianness would also have to be considered.
32509 SDValue Addr, VecIndex;
32510 unsigned Alignment;
32511 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
32514 // Extract the one scalar element that is actually being stored.
32515 SDLoc DL(MS);
32516 EVT VT = MS->getValue().getValueType();
32517 EVT EltVT = VT.getVectorElementType();
32518 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
32519 MS->getValue(), VecIndex);
32521 // Store that element at the appropriate offset from the base pointer.
32522 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
32523 Alignment, MS->getMemOperand()->getFlags());
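// Perform DAG combines on masked stores: a non-truncating store with exactly
// one active lane becomes an extract + scalar store, and a truncating masked
// store is rewritten below so the truncation is performed with a shuffle on a
// wider vector type.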
32526 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
32527 const X86Subtarget &Subtarget) {
32528 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
32530 if (Mst->isCompressingStore())
32533 if (!Mst->isTruncatingStore())
32534 return reduceMaskedStoreToScalarStore(Mst, DAG);
32536 // Resolve truncating stores.
32537 EVT VT = Mst->getValue().getValueType();
32538 unsigned NumElems = VT.getVectorNumElements();
32539 EVT StVT = Mst->getMemoryVT();
32540 SDLoc dl(Mst);
32542 assert(StVT != VT && "Cannot truncate to the same type");
32543 unsigned FromSz = VT.getScalarSizeInBits();
32544 unsigned ToSz = StVT.getScalarSizeInBits();
32546 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32548 // The truncating store is legal in some cases. For example,
32549 // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
32550 // are designed for truncating stores.
32551 // In those cases we don't need any further transformation.
32552 if (TLI.isTruncStoreLegal(VT, StVT))
32555 // From/To sizes and ElemCount must be pow of two.
32556 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
32557 "Unexpected size for truncating masked store");
32558 // We are going to use the original vector elt for storing.
32559 // Accumulated smaller vector elements must be a multiple of the store size.
32560 assert (((NumElems * FromSz) % ToSz) == 0 &&
32561 "Unexpected ratio for truncating masked store");
32563 unsigned SizeRatio = FromSz / ToSz;
32564 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
32566 // Create a type on which we perform the shuffle.
32567 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32568 StVT.getScalarType(), NumElems*SizeRatio);
32570 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32572 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
32573 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32574 for (unsigned i = 0; i != NumElems; ++i)
32575 ShuffleVec[i] = i * SizeRatio;
32577 // Can't shuffle using an illegal type.
32578 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
32579 "WideVecVT should be legal");
32581 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
32582 DAG.getUNDEF(WideVecVT),
32583 ShuffleVec);
32585 SDValue NewMask;
32586 SDValue Mask = Mst->getMask();
32587 if (Mask.getValueType() == VT) {
32588 // Mask and original value have the same type.
32589 NewMask = DAG.getBitcast(WideVecVT, Mask);
32590 for (unsigned i = 0; i != NumElems; ++i)
32591 ShuffleVec[i] = i * SizeRatio;
32592 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
32593 ShuffleVec[i] = NumElems*SizeRatio;
32594 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
32595 DAG.getConstant(0, dl, WideVecVT),
32598 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
32599 unsigned WidenNumElts = NumElems*SizeRatio;
32600 unsigned MaskNumElts = VT.getVectorNumElements();
32601 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
32604 unsigned NumConcat = WidenNumElts / MaskNumElts;
32605 SmallVector<SDValue, 16> Ops(NumConcat);
32606 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
32608 for (unsigned i = 1; i != NumConcat; ++i)
32611 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
32614 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
32615 Mst->getBasePtr(), NewMask, StVT,
32616 Mst->getMemOperand(), false);
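// Perform DAG combines on ordinary stores: split slow 32-byte stores, narrow
// truncating vector stores (recognizing AVG and saturation patterns first),
// and turn 64-bit integer/MMX load->store pairs into f64 or paired i32
// load/stores as described in the comments below.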
32619 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
32620 const X86Subtarget &Subtarget) {
32621 StoreSDNode *St = cast<StoreSDNode>(N);
32622 EVT VT = St->getValue().getValueType();
32623 EVT StVT = St->getMemoryVT();
32624 SDLoc dl(St);
32625 SDValue StoredVal = St->getOperand(1);
32626 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32628 // If we are saving a concatenation of two XMM registers and 32-byte stores
32629 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
32630 bool Fast;
32631 unsigned AddressSpace = St->getAddressSpace();
32632 unsigned Alignment = St->getAlignment();
32633 if (VT.is256BitVector() && StVT == VT &&
32634 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
32635 AddressSpace, Alignment, &Fast) &&
32636 !Fast) {
32637 unsigned NumElems = VT.getVectorNumElements();
32641 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
32642 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
32644 SDValue Ptr0 = St->getBasePtr();
32645 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
32648 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
32649 Alignment, St->getMemOperand()->getFlags());
32651 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
32652 std::min(16U, Alignment), St->getMemOperand()->getFlags());
32653 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
32656 // Optimize trunc store (of multiple scalars) to shuffle and store.
32657 // First, pack all of the elements in one place. Next, store to memory
32658 // in fewer chunks.
32659 if (St->isTruncatingStore() && VT.isVector()) {
32660 // Check if we can detect an AVG pattern from the truncation. If yes,
32661 // replace the trunc store by a normal store with the result of X86ISD::AVG
32663 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
32665 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
32666 St->getPointerInfo(), St->getAlignment(),
32667 St->getMemOperand()->getFlags());
32670 detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
32671 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
32672 dl, Val, St->getBasePtr(),
32673 St->getMemoryVT(), St->getMemOperand(), DAG);
32675 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32676 unsigned NumElems = VT.getVectorNumElements();
32677 assert(StVT != VT && "Cannot truncate to the same type");
32678 unsigned FromSz = VT.getScalarSizeInBits();
32679 unsigned ToSz = StVT.getScalarSizeInBits();
32681 // The truncating store is legal in some cases. For example,
32682 // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
32683 // are designed for truncating stores.
32684 // In those cases we don't need any further transformation.
32685 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
32688 // From, To sizes and ElemCount must be pow of two
32689 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
32690 // We are going to use the original vector elt for storing.
32691 // Accumulated smaller vector elements must be a multiple of the store size.
32692 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
32694 unsigned SizeRatio = FromSz / ToSz;
32696 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
32698 // Create a type on which we perform the shuffle
32699 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32700 StVT.getScalarType(), NumElems*SizeRatio);
32702 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32704 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
32705 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
32706 for (unsigned i = 0; i != NumElems; ++i)
32707 ShuffleVec[i] = i * SizeRatio;
32709 // Can't shuffle using an illegal type.
32710 if (!TLI.isTypeLegal(WideVecVT))
32713 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
32714 DAG.getUNDEF(WideVecVT),
32716 // At this point all of the data is stored at the bottom of the
32717 // register. We now need to save it to mem.
32719 // Find the largest store unit
32720 MVT StoreType = MVT::i8;
32721 for (MVT Tp : MVT::integer_valuetypes()) {
32722 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
32726 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
32727 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
32728 (64 <= NumElems * ToSz))
32729 StoreType = MVT::f64;
32731 // Bitcast the original vector into a vector of store-size units
32732 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
32733 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
32734 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
32735 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
32736 SmallVector<SDValue, 8> Chains;
32737 SDValue Ptr = St->getBasePtr();
32739 // Perform one or more big stores into memory.
32740 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
32741 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
32742 StoreType, ShuffWide,
32743 DAG.getIntPtrConstant(i, dl));
32745 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
32746 St->getAlignment(), St->getMemOperand()->getFlags());
32747 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
32748 Chains.push_back(Ch);
32751 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
32754 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
32755 // the FP state in cases where an emms may be missing.
32756 // A preferable solution to the general problem is to figure out the right
32757 // places to insert EMMS. This qualifies as a quick hack.
32759 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
32760 if (VT.getSizeInBits() != 64)
32763 const Function *F = DAG.getMachineFunction().getFunction();
32764 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
32765 bool F64IsLegal =
32766 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
32767 if ((VT.isVector() ||
32768 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
32769 isa<LoadSDNode>(St->getValue()) &&
32770 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
32771 St->getChain().hasOneUse() && !St->isVolatile()) {
32772 SDNode* LdVal = St->getValue().getNode();
32773 LoadSDNode *Ld = nullptr;
32774 int TokenFactorIndex = -1;
32775 SmallVector<SDValue, 8> Ops;
32776 SDNode* ChainVal = St->getChain().getNode();
32777 // Must be a store of a load. We currently handle two cases: the load
32778 // is a direct child, and it's under an intervening TokenFactor. It is
32779 // possible to dig deeper under nested TokenFactors.
32780 if (ChainVal == LdVal)
32781 Ld = cast<LoadSDNode>(St->getChain());
32782 else if (St->getValue().hasOneUse() &&
32783 ChainVal->getOpcode() == ISD::TokenFactor) {
32784 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
32785 if (ChainVal->getOperand(i).getNode() == LdVal) {
32786 TokenFactorIndex = i;
32787 Ld = cast<LoadSDNode>(St->getValue());
32789 Ops.push_back(ChainVal->getOperand(i));
32793 if (!Ld || !ISD::isNormalLoad(Ld))
32796 // If this is not the MMX case, i.e. we are just turning i64 load/store
32797 // into f64 load/store, avoid the transformation if there are multiple
32798 // uses of the loaded value.
32799 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
32800 return SDValue();
32802 SDLoc LdDL(Ld), StDL(N);
32804 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
32805 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
32807 if (Subtarget.is64Bit() || F64IsLegal) {
32808 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
32809 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
32810 Ld->getPointerInfo(), Ld->getAlignment(),
32811 Ld->getMemOperand()->getFlags());
32812 SDValue NewChain = NewLd.getValue(1);
32813 if (TokenFactorIndex >= 0) {
32814 Ops.push_back(NewChain);
32815 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
32817 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
32818 St->getPointerInfo(), St->getAlignment(),
32819 St->getMemOperand()->getFlags());
32822 // Otherwise, lower to two pairs of 32-bit loads / stores.
32823 SDValue LoAddr = Ld->getBasePtr();
32824 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
32826 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
32827 Ld->getPointerInfo(), Ld->getAlignment(),
32828 Ld->getMemOperand()->getFlags());
32829 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
32830 Ld->getPointerInfo().getWithOffset(4),
32831 MinAlign(Ld->getAlignment(), 4),
32832 Ld->getMemOperand()->getFlags());
32834 SDValue NewChain = LoLd.getValue(1);
32835 if (TokenFactorIndex >= 0) {
32836 Ops.push_back(LoLd);
32837 Ops.push_back(HiLd);
32838 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
32841 LoAddr = St->getBasePtr();
32842 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
32845 DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
32846 St->getAlignment(), St->getMemOperand()->getFlags());
32847 SDValue HiSt = DAG.getStore(
32848 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
32849 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
32850 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
32853 // This is similar to the above case, but here we handle a scalar 64-bit
32854 // integer store that is extracted from a vector on a 32-bit target.
32855 // If we have SSE2, then we can treat it like a floating-point double
32856 // to get past legalization. The execution dependencies fixup pass will
32857 // choose the optimal machine instruction for the store if this really is
32858 // an integer or v2f32 rather than an f64.
32859 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
32860 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
32861 SDValue OldExtract = St->getOperand(1);
32862 SDValue ExtOp0 = OldExtract.getOperand(0);
32863 unsigned VecSize = ExtOp0.getValueSizeInBits();
32864 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
32865 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
32866 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
32867 BitCast, OldExtract.getOperand(1));
32868 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
32869 St->getPointerInfo(), St->getAlignment(),
32870 St->getMemOperand()->getFlags());
32876 /// Return 'true' if this vector operation is "horizontal"
32877 /// and return the operands for the horizontal operation in LHS and RHS. A
32878 /// horizontal operation performs the binary operation on successive elements
32879 /// of its first operand, then on successive elements of its second operand,
32880 /// returning the resulting values in a vector. For example, if
32881 /// A = < float a0, float a1, float a2, float a3 >
32883 /// B = < float b0, float b1, float b2, float b3 >
32884 /// then the result of doing a horizontal operation on A and B is
32885 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
32886 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
32887 /// A horizontal-op B, for some already available A and B, and if so then LHS is
32888 /// set to A, RHS to B, and the routine returns 'true'.
32889 /// Note that the binary operation should have the property that if one of the
32890 /// operands is UNDEF then the result is UNDEF.
32891 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
32892 // Look for the following pattern: if
32893 // A = < float a0, float a1, float a2, float a3 >
32894 // B = < float b0, float b1, float b2, float b3 >
32896 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
32897 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
32898 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
32899 // which is A horizontal-op B.
32901 // At least one of the operands should be a vector shuffle.
32902 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
32903 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
32906 MVT VT = LHS.getSimpleValueType();
32908 assert((VT.is128BitVector() || VT.is256BitVector()) &&
32909 "Unsupported vector type for horizontal add/sub");
32911 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
32912 // operate independently on 128-bit lanes.
32913 unsigned NumElts = VT.getVectorNumElements();
32914 unsigned NumLanes = VT.getSizeInBits()/128;
32915 unsigned NumLaneElts = NumElts / NumLanes;
32916 assert((NumLaneElts % 2 == 0) &&
32917 "Vector type should have an even number of elements in each lane");
32918 unsigned HalfLaneElts = NumLaneElts/2;
32920 // View LHS in the form
32921 // LHS = VECTOR_SHUFFLE A, B, LMask
32922 // If LHS is not a shuffle then pretend it is the shuffle
32923 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
32924 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
32927 SmallVector<int, 16> LMask(NumElts);
32928 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
32929 if (!LHS.getOperand(0).isUndef())
32930 A = LHS.getOperand(0);
32931 if (!LHS.getOperand(1).isUndef())
32932 B = LHS.getOperand(1);
32933 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
32934 std::copy(Mask.begin(), Mask.end(), LMask.begin());
32936 if (!LHS.isUndef())
32938 for (unsigned i = 0; i != NumElts; ++i)
32942 // Likewise, view RHS in the form
32943 // RHS = VECTOR_SHUFFLE C, D, RMask
32945 SmallVector<int, 16> RMask(NumElts);
32946 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
32947 if (!RHS.getOperand(0).isUndef())
32948 C = RHS.getOperand(0);
32949 if (!RHS.getOperand(1).isUndef())
32950 D = RHS.getOperand(1);
32951 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
32952 std::copy(Mask.begin(), Mask.end(), RMask.begin());
32954 if (!RHS.isUndef())
32956 for (unsigned i = 0; i != NumElts; ++i)
32960 // Check that the shuffles are both shuffling the same vectors.
32961 if (!(A == C && B == D) && !(A == D && B == C))
32964 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
32965 if (!A.getNode() && !B.getNode())
32968 // If A and B occur in reverse order in RHS, then "swap" them (which means
32969 // rewriting the mask).
32971 ShuffleVectorSDNode::commuteMask(RMask);
32973 // At this point LHS and RHS are equivalent to
32974 // LHS = VECTOR_SHUFFLE A, B, LMask
32975 // RHS = VECTOR_SHUFFLE A, B, RMask
32976 // Check that the masks correspond to performing a horizontal operation.
32977 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
32978 for (unsigned i = 0; i != NumLaneElts; ++i) {
32979 int LIdx = LMask[i+l], RIdx = RMask[i+l];
32981 // Ignore any UNDEF components.
32982 if (LIdx < 0 || RIdx < 0 ||
32983 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
32984 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
32987 // Check that successive elements are being operated on. If not, this is
32988 // not a horizontal operation.
32989 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
32990 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
32991 if (!(LIdx == Index && RIdx == Index + 1) &&
32992 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
32997 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
32998 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
32999 return true;
33000 }
33002 /// Do target-specific dag combines on floating-point adds/subs.
33003 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
33004 const X86Subtarget &Subtarget) {
33005 EVT VT = N->getValueType(0);
33006 SDValue LHS = N->getOperand(0);
33007 SDValue RHS = N->getOperand(1);
33008 bool IsFadd = N->getOpcode() == ISD::FADD;
33009 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
33011 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
33012 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
33013 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
33014 isHorizontalBinOp(LHS, RHS, IsFadd)) {
33015 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
33016 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
33021 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
33022 /// the codegen.
33023 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
33024 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
33025 const X86Subtarget &Subtarget,
33026 const SDLoc &DL) {
33027 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
33028 SDValue Src = N->getOperand(0);
33029 unsigned Opcode = Src.getOpcode();
33030 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33032 EVT VT = N->getValueType(0);
33033 EVT SrcVT = Src.getValueType();
33035 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
33036 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
33038 // Repeated operand, so we are only trading one output truncation for
33039 // one input truncation.
33043 // See if either operand has been extended from a smaller/equal size to
33044 // the truncation size, allowing a truncation to combine with the extend.
33045 unsigned Opcode0 = Op0.getOpcode();
33046 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
33047 Opcode0 == ISD::ZERO_EXTEND) &&
33048 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33051 unsigned Opcode1 = Op1.getOpcode();
33052 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
33053 Opcode1 == ISD::ZERO_EXTEND) &&
33054 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33057 // See if either operand is a single use constant which can be constant
33059 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
33060 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
33061 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
33062 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
33065 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
33066 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
33067 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
33068 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
33071 // Don't combine if the operation has other uses.
33072 if (!N->isOnlyUserOf(Src.getNode()))
33075 // Only support vector truncation for now.
33076 // TODO: i64 scalar math would benefit as well.
33077 if (!VT.isVector())
33080 // In most cases it's only worth pre-truncating if we're only facing the cost
33081 // of one truncation.
33082 // i.e. if one of the inputs will constant fold or the input is repeated.
33087 SDValue Op0 = Src.getOperand(0);
33088 SDValue Op1 = Src.getOperand(1);
33089 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
33090 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33091 return TruncateArithmetic(Op0, Op1);
33096 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
33097 // better to truncate if we have the chance.
33098 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
33099 !TLI.isOperationLegal(Opcode, SrcVT))
33100 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
33103 SDValue Op0 = Src.getOperand(0);
33104 SDValue Op1 = Src.getOperand(1);
33105 if (TLI.isOperationLegal(Opcode, VT) &&
33106 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33107 return TruncateArithmetic(Op0, Op1);
33115 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
33116 static SDValue
33117 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
33118 SmallVector<SDValue, 8> &Regs) {
33119 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
33120 Regs[0].getValueType() == MVT::v2i64));
33121 EVT OutVT = N->getValueType(0);
33122 EVT OutSVT = OutVT.getVectorElementType();
33123 EVT InVT = Regs[0].getValueType();
33124 EVT InSVT = InVT.getVectorElementType();
33125 SDLoc DL(N);
33127 // First, use mask to unset all bits that won't appear in the result.
33128 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
33129 "OutSVT can only be either i8 or i16.");
33130 APInt Mask =
33131 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
33132 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
33133 for (auto &Reg : Regs)
33134 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
33136 MVT UnpackedVT, PackedVT;
33137 if (OutSVT == MVT::i8) {
33138 UnpackedVT = MVT::v8i16;
33139 PackedVT = MVT::v16i8;
33141 UnpackedVT = MVT::v4i32;
33142 PackedVT = MVT::v8i16;
33145 // In each iteration, truncate the type by a half size.
33146 auto RegNum = Regs.size();
33147 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
33148 j < e; j *= 2, RegNum /= 2) {
33149 for (unsigned i = 0; i < RegNum; i++)
33150 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
33151 for (unsigned i = 0; i < RegNum / 2; i++)
33152 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
33156 // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
33157 // then extract a subvector as the result since v8i8 is not a legal type.
33158 if (OutVT == MVT::v8i8) {
33159 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
33160 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
33161 DAG.getIntPtrConstant(0, DL));
33163 } else if (RegNum > 1) {
33164 Regs.resize(RegNum);
33165 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33170 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
33171 static SDValue
33172 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
33173 SelectionDAG &DAG,
33174 SmallVector<SDValue, 8> &Regs) {
33175 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
33176 EVT OutVT = N->getValueType(0);
33177 SDLoc DL(N);
33179 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
33180 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
33181 for (auto &Reg : Regs) {
33182 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
33184 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
33188 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
33189 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
33192 if (Regs.size() > 2) {
33193 Regs.resize(Regs.size() / 2);
33194 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33199 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
33200 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
33201 /// legalization the truncation will be translated into a BUILD_VECTOR with each
33202 /// element that is extracted from a vector and then truncated, and it is
33203 /// difficult to do this optimization based on them.
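/// Illustrative example (shapes chosen for exposition): with SSE4.1, a
/// v8i32 -> v8i16 truncation is split into two v4i32 halves, each half is
/// masked down to its low 16 bits, and the halves are merged into the final
/// v8i16 value with X86ISD::PACKUS.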
33204 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
33205 const X86Subtarget &Subtarget) {
33206 EVT OutVT = N->getValueType(0);
33207 if (!OutVT.isVector())
33210 SDValue In = N->getOperand(0);
33211 if (!In.getValueType().isSimple())
33214 EVT InVT = In.getValueType();
33215 unsigned NumElems = OutVT.getVectorNumElements();
33217 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
33218 // SSE2, and we need to take care of it specially.
33219 // AVX512 provides vpmovdb.
33220 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
33223 EVT OutSVT = OutVT.getVectorElementType();
33224 EVT InSVT = InVT.getVectorElementType();
33225 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
33226 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
33231 // SSSE3's pshufb results in fewer instructions in the cases below.
33231 if (Subtarget.hasSSSE3() && NumElems == 8 &&
33232 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
33233 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
33238 // Split a long vector into vectors of legal type.
33239 unsigned RegNum = InVT.getSizeInBits() / 128;
33240 SmallVector<SDValue, 8> SubVec(RegNum);
33241 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
33242 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
33244 for (unsigned i = 0; i < RegNum; i++)
33245 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
33246 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
33248 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
33249 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
33250 // truncate 2 x v4i32 to v8i16.
33251 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
33252 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
33253 else if (InSVT == MVT::i32)
33254 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
33259 /// This function transforms vector truncation of 'all or none' bit values,
33260 /// i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32, into X86ISD::PACKSS operations.
33261 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
33262 SelectionDAG &DAG,
33263 const X86Subtarget &Subtarget) {
33264 // Requires SSE2 but AVX512 has fast truncate.
33265 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
33268 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
33271 SDValue In = N->getOperand(0);
33272 if (!In.getValueType().isSimple())
33275 MVT VT = N->getValueType(0).getSimpleVT();
33276 MVT SVT = VT.getScalarType();
33278 MVT InVT = In.getValueType().getSimpleVT();
33279 MVT InSVT = InVT.getScalarType();
33281 // Use PACKSS if the input is a splatted sign bit.
33282 // e.g. Comparison result, sext_in_reg, etc.
33283 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
33284 if (NumSignBits != InSVT.getSizeInBits())
33287 // Check we have a truncation suited for PACKSS.
33288 if (!VT.is128BitVector() && !VT.is256BitVector())
33290 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
33292 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
33295 return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
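// Perform DAG combines on ISD::TRUNCATE: pre-truncate binary-op inputs when
// profitable, recognize AVG and unsigned-saturation patterns, extract MMX
// results directly with MMX_MOVD2W, truncate splatted sign bits with PACKSS,
// and otherwise fall back to the PACKUS/PACKSS vector truncation above.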
33298 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
33299 const X86Subtarget &Subtarget) {
33300 EVT VT = N->getValueType(0);
33301 SDValue Src = N->getOperand(0);
33302 SDLoc DL(N);
33304 // Attempt to pre-truncate inputs to arithmetic ops instead.
33305 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
33308 // Try to detect AVG pattern first.
33309 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
33312 // Try to combine truncation with unsigned saturation.
33313 if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
33316 // The bitcast source is a direct mmx result.
33317 // Detect bitcasts from x86mmx to i32.
33318 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
33319 SDValue BCSrc = Src.getOperand(0);
33320 if (BCSrc.getValueType() == MVT::x86mmx)
33321 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
33324 // Try to truncate extended sign bits with PACKSS.
33325 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
33328 return combineVectorTruncation(N, DAG, Subtarget);
33331 /// Returns the negated value if the node \p N flips sign of FP value.
33333 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
33334 /// AVX512F does not have FXOR, so FNEG is lowered as
33335 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
33336 /// In this case we go through all bitcasts.
33337 static SDValue isFNEG(SDNode *N) {
33338 if (N->getOpcode() == ISD::FNEG)
33339 return N->getOperand(0);
33341 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
33342 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
33345 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
33346 if (!Op1.getValueType().isFloatingPoint())
33349 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
33351 unsigned EltBits = Op1.getScalarValueSizeInBits();
33352 auto isSignMask = [&](const ConstantFP *C) {
33353 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
33356 // There is more than one way to represent the same constant on
33357 // the different X86 targets. The type of the node may also depend on size.
33358 // - load scalar value and broadcast
33359 // - BUILD_VECTOR node
33360 // - load from a constant pool.
33361 // We check all variants here.
33362 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
33363 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
33364 if (isSignMask(cast<ConstantFP>(C)))
33367 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
33368 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
33369 if (isSignMask(CN->getConstantFPValue()))
33372 } else if (auto *C = getTargetConstantFromNode(Op1)) {
33373 if (C->getType()->isVectorTy()) {
33374 if (auto *SplatV = C->getSplatValue())
33375 if (isSignMask(cast<ConstantFP>(SplatV)))
33377 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
33378 if (isSignMask(FPConst))
33384 /// Do target-specific dag combines on floating point negations.
33385 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
33386 const X86Subtarget &Subtarget) {
33387 EVT OrigVT = N->getValueType(0);
33388 SDValue Arg = isFNEG(N);
33389 assert(Arg.getNode() && "N is expected to be an FNEG node");
33391 EVT VT = Arg.getValueType();
33392 EVT SVT = VT.getScalarType();
33393 SDLoc DL(N);
33395 // Let legalize expand this if it isn't a legal type yet.
33396 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33399 // If we're negating a FMUL node on a target with FMA, then we can avoid the
33400 // use of a constant by performing (-0 - A*B) instead.
33401 // FIXME: Check rounding control flags as well once it becomes available.
33402 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
33403 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
33404 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
33405 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
33406 Arg.getOperand(1), Zero);
33407 return DAG.getBitcast(OrigVT, NewNode);
33410 // If we're negating an FMA node, then we can adjust the
33411 // instruction to include the extra negation.
33412 unsigned NewOpcode = 0;
33413 if (Arg.hasOneUse()) {
33414 switch (Arg.getOpcode()) {
33415 case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
33416 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
33417 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
33418 case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
33419 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
33420 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
33421 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
33422 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
33423 // We can't handle a scalar intrinsic node here because it would only
33424 // invert one element and not the whole vector. But we could try to handle
33425 // a negation of the lower element only.
33429 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
33430 Arg.getNode()->ops()));
33435 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
33436 const X86Subtarget &Subtarget) {
33437 MVT VT = N->getSimpleValueType(0);
33438 // If we have integer vector types available, use the integer opcodes.
33439 if (VT.isVector() && Subtarget.hasSSE2()) {
33442 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
33444 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
33445 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
33446 unsigned IntOpcode;
33447 switch (N->getOpcode()) {
33448 default: llvm_unreachable("Unexpected FP logic op");
33449 case X86ISD::FOR: IntOpcode = ISD::OR; break;
33450 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
33451 case X86ISD::FAND: IntOpcode = ISD::AND; break;
33452 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
33454 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
33455 return DAG.getBitcast(VT, IntOp);
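// Perform DAG combines on ISD::XOR: fold xor-of-shift patterns into compares,
// recognize integer abs when CMOV is available, convert integer logic on FP
// bitcasts to FP logic, and (when the node matches an FNEG pattern) defer to
// combineFneg.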
33460 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
33461 TargetLowering::DAGCombinerInfo &DCI,
33462 const X86Subtarget &Subtarget) {
33463 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
33466 if (DCI.isBeforeLegalizeOps())
33469 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
33472 if (Subtarget.hasCMov())
33473 if (SDValue RV = combineIntegerAbs(N, DAG))
33476 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
33480 return combineFneg(N, DAG, Subtarget);
33485 static bool isNullFPScalarOrVectorConst(SDValue V) {
33486 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
33489 /// If a value is a scalar FP zero or a vector FP zero (potentially including
33490 /// undefined elements), return a zero constant that may be used to fold away
33491 /// that value. In the case of a vector, the returned constant will not contain
33492 /// undefined elements even if the input parameter does. This makes it suitable
33493 /// to be used as a replacement operand with operations (eg, bitwise-and) where
33494 /// an undef should not propagate.
33495 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
33496 const X86Subtarget &Subtarget) {
33497 if (!isNullFPScalarOrVectorConst(V))
33500 if (V.getValueType().isVector())
33501 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
33506 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
33507 const X86Subtarget &Subtarget) {
33508 SDValue N0 = N->getOperand(0);
33509 SDValue N1 = N->getOperand(1);
33510 EVT VT = N->getValueType(0);
33513 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
33514 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
33515 (VT == MVT::f64 && Subtarget.hasSSE2())))
33518 auto isAllOnesConstantFP = [](SDValue V) {
33519 auto *C = dyn_cast<ConstantFPSDNode>(V);
33520 return C && C->getConstantFPValue()->isAllOnesValue();
33523 // fand (fxor X, -1), Y --> fandn X, Y
33524 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
33525 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
33527 // fand X, (fxor Y, -1) --> fandn Y, X
33528 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
33529 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
33534 /// Do target-specific dag combines on X86ISD::FAND nodes.
33535 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
33536 const X86Subtarget &Subtarget) {
33537 // FAND(0.0, x) -> 0.0
33538 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
33541 // FAND(x, 0.0) -> 0.0
33542 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
33545 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
33548 return lowerX86FPLogicOp(N, DAG, Subtarget);
33551 /// Do target-specific dag combines on X86ISD::FANDN nodes.
33552 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
33553 const X86Subtarget &Subtarget) {
33554 // FANDN(0.0, x) -> x
33555 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
33556 return N->getOperand(1);
33558 // FANDN(x, 0.0) -> 0.0
33559 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
33562 return lowerX86FPLogicOp(N, DAG, Subtarget);
33565 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
33566 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
33567 const X86Subtarget &Subtarget) {
33568 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
33570 // F[X]OR(0.0, x) -> x
33571 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
33572 return N->getOperand(1);
33574 // F[X]OR(x, 0.0) -> x
33575 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
33576 return N->getOperand(0);
33579 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
33582 return lowerX86FPLogicOp(N, DAG, Subtarget);
33585 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
33586 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
33587 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
33589 // Only perform optimizations if UnsafeMath is used.
33590 if (!DAG.getTarget().Options.UnsafeFPMath)
33593 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
33594 // into FMAXC and FMINC, which are commutative operations.
33595 unsigned NewOp = 0;
33596 switch (N->getOpcode()) {
33597 default: llvm_unreachable("unknown opcode");
33598 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
33599 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
33602 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
33603 N->getOperand(0), N->getOperand(1));
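// Lower ISD::FMINNUM/FMAXNUM with SSE FMIN/FMAX plus a select that picks the
// other operand when Op0 is a NaN, giving the "return the non-NaN operand"
// semantics that the bare SSE instructions lack.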
33606 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
33607 const X86Subtarget &Subtarget) {
33608 if (Subtarget.useSoftFloat())
33611 // TODO: Check for global or instruction-level "nnan". In that case, we
33612 // should be able to lower to FMAX/FMIN alone.
33613 // TODO: If an operand is already known to be a NaN or not a NaN, this
33614 // should be an optional swap and FMAX/FMIN.
33616 EVT VT = N->getValueType(0);
33617 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
33618 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
33619 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
33622 // This takes at least 3 instructions, so favor a library call when operating
33623 // on a scalar and minimizing code size.
33624 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
33627 SDValue Op0 = N->getOperand(0);
33628 SDValue Op1 = N->getOperand(1);
33630 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
33631 DAG.getDataLayout(), *DAG.getContext(), VT);
33633 // There are 4 possibilities involving NaN inputs, and these are the required
33634 // outputs:
33635 //                   Op1
33636 //               Num     NaN
33637 //            ----------------
33638 //     Num    |  Max  |  Op0 |
33639 // Op0        ----------------
33640 //     NaN    |  Op1  |  NaN |
33641 //            ----------------
33643 // The SSE FP max/min instructions were not designed for this case, but rather
33644 // to implement:
33645 //   Min = Op1 < Op0 ? Op1 : Op0
33646 //   Max = Op1 > Op0 ? Op1 : Op0
33648 // So they always return Op0 if either input is a NaN. However, we can still
33649 // use those instructions for fmaxnum by selecting away a NaN input.
33651 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
33652 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
33653 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
33654 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
33656 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
33657 // are NaN, the NaN value of Op1 is the result.
33658 auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
33659 return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
33662 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
33663 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
33664 TargetLowering::DAGCombinerInfo &DCI,
33665 const X86Subtarget &Subtarget) {
33666 // ANDNP(0, x) -> x
33667 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
33668 return N->getOperand(1);
33670 // ANDNP(x, 0) -> 0
33671 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
33672 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
33674 EVT VT = N->getValueType(0);
33676 // Attempt to recursively combine a bitmask ANDNP with shuffles.
33677 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
33679 SmallVector<int, 1> NonceMask; // Just a placeholder.
33680 NonceMask.push_back(0);
33681 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
33682 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
33684 return SDValue(); // This routine will use CombineTo to replace N.
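// Perform DAG combines on X86ISD::BT: the bit-test instruction only reads the
// low log2(bit-width) bits of its index operand, so the index can be
// simplified with demanded-bits analysis.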
33690 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
33691 TargetLowering::DAGCombinerInfo &DCI) {
33692 // BT ignores high bits in the bit index operand.
33693 SDValue Op1 = N->getOperand(1);
33694 if (Op1.hasOneUse()) {
33695 unsigned BitWidth = Op1.getValueSizeInBits();
33696 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
33697 KnownBits Known;
33698 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
33699 !DCI.isBeforeLegalizeOps());
33700 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33701 if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
33702 TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
33703 DCI.CommitTargetLoweringOpt(TLO);
33708 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
33709 const X86Subtarget &Subtarget) {
33710 EVT VT = N->getValueType(0);
33711 if (!VT.isVector())
33714 SDValue N0 = N->getOperand(0);
33715 SDValue N1 = N->getOperand(1);
33716 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
33719 // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
33720 // both SSE and AVX2 since there is no sign-extended shift right
33721 // operation on a vector with 64-bit elements.
33722 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
33723 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
33724 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
33725 N0.getOpcode() == ISD::SIGN_EXTEND)) {
33726 SDValue N00 = N0.getOperand(0);
33728 // EXTLOAD has a better solution on AVX2,
33729 // it may be replaced with X86ISD::VSEXT node.
33730 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
33731 if (!ISD::isNormalLoad(N00.getNode()))
33734 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
33735 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
33737 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
33743 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
33744 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
33745 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
33746 /// opportunities to combine math ops, use an LEA, or use a complex addressing
33747 /// mode. This can eliminate extend, add, and shift instructions.
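/// For example (illustrative): (i64 (sext (add nsw i32 %x, 42))) can become
/// (i64 (add (sext i32 %x), 42)), and the wider add can then fold into an LEA
/// together with a following add or shl user.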
33748 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
33749 const X86Subtarget &Subtarget) {
33750 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
33751 Ext->getOpcode() != ISD::ZERO_EXTEND)
33754 // TODO: This should be valid for other integer types.
33755 EVT VT = Ext->getValueType(0);
33756 if (VT != MVT::i64)
33759 SDValue Add = Ext->getOperand(0);
33760 if (Add.getOpcode() != ISD::ADD)
33763 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
33764 bool NSW = Add->getFlags().hasNoSignedWrap();
33765 bool NUW = Add->getFlags().hasNoUnsignedWrap();
33767 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
33769 if ((Sext && !NSW) || (!Sext && !NUW))
33772 // Having a constant operand to the 'add' ensures that we are not increasing
33773 // the instruction count because the constant is extended for free below.
33774 // A constant operand can also become the displacement field of an LEA.
33775 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
33779 // Don't make the 'add' bigger if there's no hope of combining it with some
33780 // other 'add' or 'shl' instruction.
33781 // TODO: It may be profitable to generate simpler LEA instructions in place
33782 // of single 'add' instructions, but the cost model for selecting an LEA
33783 // currently has a high threshold.
33784 bool HasLEAPotential = false;
33785 for (auto *User : Ext->uses()) {
33786 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
33787 HasLEAPotential = true;
33791 if (!HasLEAPotential)
33794 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
33795 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
33796 SDValue AddOp0 = Add.getOperand(0);
33797 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
33798 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
33800 // The wider add is guaranteed to not wrap because both operands are
33803 Flags.setNoSignedWrap(NSW);
33804 Flags.setNoUnsignedWrap(NUW);
33805 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
33808 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
33809 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
33810 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
33811 /// extends from AH (which we otherwise need to do contortions to access).
33812 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
33813 SDValue N0 = N->getOperand(0);
33814 auto OpcodeN = N->getOpcode();
33815 auto OpcodeN0 = N0.getOpcode();
33816 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
33817 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
33820 EVT VT = N->getValueType(0);
33821 EVT InVT = N0.getValueType();
33822 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
33825 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
33826 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
33827 : X86ISD::UDIVREM8_ZEXT_HREG;
33828 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
33830 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
33831 return R.getValue(1);
33834 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
33835 /// ZERO_EXTEND_VECTOR_INREG. This requires splitting the input (or
33836 /// concatenating it with UNDEFs) into vectors of the same size as the target
33837 /// type; the *_EXTEND_VECTOR_INREG node then extends only the lowest elements.
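/// For example (illustrative): a sext of v4i16 to v4i32 is handled by
/// concatenating the v4i16 input with an UNDEF v4i16 to form a v8i16 of the
/// same width as the v4i32 result and then emitting SIGN_EXTEND_VECTOR_INREG,
/// which sign-extends only the low four elements.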
33838 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
33839 TargetLowering::DAGCombinerInfo &DCI,
33840 const X86Subtarget &Subtarget) {
33841 unsigned Opcode = N->getOpcode();
33842 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
33844 if (!DCI.isBeforeLegalizeOps())
33846 if (!Subtarget.hasSSE2())
33849 SDValue N0 = N->getOperand(0);
33850 EVT VT = N->getValueType(0);
33851 EVT SVT = VT.getScalarType();
33852 EVT InVT = N0.getValueType();
33853 EVT InSVT = InVT.getScalarType();
33855 // Input type must be a vector and we must be extending legal integer types.
33856 if (!VT.isVector())
33858 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
33860 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
33863 // On AVX2+ targets, if the input/output types are both legal then we will be
33864 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
33865 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
33866 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
33871 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
33872 EVT InVT = N.getValueType();
33873 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
33874 Size / InVT.getScalarSizeInBits());
33875 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
33876 DAG.getUNDEF(InVT));
33878 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
33881 // If the target type is smaller than 128 bits, widen the input so that its
33882 // extension would be 128 bits, extend that, and extract the original target vector.
33883 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
33884 unsigned Scale = 128 / VT.getSizeInBits();
33885 EVT ExVT =
33886 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
33887 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
33888 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
33889 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
33890 DAG.getIntPtrConstant(0, DL));
33893 // If the target size is 128 bits (or 256 bits on AVX2 targets, or 512 bits on
33894 // AVX512 targets), convert to ISD::*_EXTEND_VECTOR_INREG, which ensures lowering
33895 // to X86ISD::V*EXT. Also use this if we don't have SSE41, to allow the legalizer to do its job.
33896 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
33897 (VT.is256BitVector() && Subtarget.hasInt256()) ||
33898 (VT.is512BitVector() && Subtarget.hasAVX512())) {
33899 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
33900 return Opcode == ISD::SIGN_EXTEND
33901 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
33902 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
33905 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
33906 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
33907 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
33908 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
33909 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
33911 SmallVector<SDValue, 8> Opnds;
33912 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
33913 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
33914 DAG.getIntPtrConstant(Offset, DL));
33915 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
33916 SrcVec = Opcode == ISD::SIGN_EXTEND
33917 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
33918 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
33919 Opnds.push_back(SrcVec);
33921 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
33924 // On pre-AVX2 targets, split into 128-bit nodes of
33925 // ISD::*_EXTEND_VECTOR_INREG.
33926 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
33927 return SplitAndExtendInReg(128);
33929 // On pre-AVX512 targets, split into 256-bit nodes of
33930 // ISD::*_EXTEND_VECTOR_INREG.
33931 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
33932 return SplitAndExtendInReg(256);
33937 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
33938 TargetLowering::DAGCombinerInfo &DCI,
33939 const X86Subtarget &Subtarget) {
33940 SDValue N0 = N->getOperand(0);
33941 EVT VT = N->getValueType(0);
33942 EVT InVT = N0.getValueType();
33945 if (SDValue DivRem8 = getDivRem8(N, DAG))
33948 if (!DCI.isBeforeLegalizeOps()) {
33949 if (InVT == MVT::i1) {
33950 SDValue Zero = DAG.getConstant(0, DL, VT);
33951 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
33952 return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
33957 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
33958 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
33959 // Inverting and sign-extending a boolean is the same as zero-extending it and
33960 // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
33961 // efficiently lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
33962 // sext (xor Bool, -1) --> sub (zext Bool), 1
33963 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
33964 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
33967 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
33970 if (Subtarget.hasAVX() && VT.is256BitVector())
33971 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
33974 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
33980 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
33981 const X86Subtarget &Subtarget) {
33983 EVT VT = N->getValueType(0);
33985 // Let legalize expand this if it isn't a legal type yet.
33986 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33989 EVT ScalarVT = VT.getScalarType();
33990 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
33993 SDValue A = N->getOperand(0);
33994 SDValue B = N->getOperand(1);
33995 SDValue C = N->getOperand(2);
33997 auto invertIfNegative = [](SDValue &V) {
33998 if (SDValue NegVal = isFNEG(V.getNode())) {
34005 // Do not convert the passthru input of scalar intrinsics.
34006 // FIXME: We could allow negations of the lower element only.
34007 bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
34008 bool NegB = invertIfNegative(B);
34009 bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
34011 // The multiplication is negated when exactly one of NegA/NegB is set (NegA xor NegB).
34012 bool NegMul = (NegA != NegB);
34014 unsigned NewOpcode;
34016 NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
34018 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
34021 if (N->getOpcode() == X86ISD::FMADD_RND) {
34022 switch (NewOpcode) {
34023 case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
34024 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
34025 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
34026 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
34028 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
34029 switch (NewOpcode) {
34030 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
34031 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
34032 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
34033 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
34035 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
34036 switch (NewOpcode) {
34037 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
34038 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
34039 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
34040 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
34043 assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
34044 "Unexpected opcode!");
34045 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
34048 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
34051 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
34052 TargetLowering::DAGCombinerInfo &DCI,
34053 const X86Subtarget &Subtarget) {
34054 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
34055 // (and (i32 x86isd::setcc_carry), 1)
34056 // This eliminates the zext. This transformation is necessary because
34057 // ISD::SETCC is always legalized to i8.
34059 SDValue N0 = N->getOperand(0);
34060 EVT VT = N->getValueType(0);
34062 if (N0.getOpcode() == ISD::AND &&
34064 N0.getOperand(0).hasOneUse()) {
34065 SDValue N00 = N0.getOperand(0);
34066 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34067 if (!isOneConstant(N0.getOperand(1)))
34069 return DAG.getNode(ISD::AND, dl, VT,
34070 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34071 N00.getOperand(0), N00.getOperand(1)),
34072 DAG.getConstant(1, dl, VT));
34076 if (N0.getOpcode() == ISD::TRUNCATE &&
34078 N0.getOperand(0).hasOneUse()) {
34079 SDValue N00 = N0.getOperand(0);
34080 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34081 return DAG.getNode(ISD::AND, dl, VT,
34082 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34083 N00.getOperand(0), N00.getOperand(1)),
34084 DAG.getConstant(1, dl, VT));
34088 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34091 if (VT.is256BitVector())
34092 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34095 if (SDValue DivRem8 = getDivRem8(N, DAG))
34098 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34101 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
34107 /// Try to map a 128-bit or larger integer comparison to vector instructions
34108 /// before type legalization splits it up into chunks.
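/// Without this, an i128/i256 equality is legalized into a chain of scalar
/// 64-bit compares; a single vector compare plus a MOVMSK test is usually
/// cheaper (see the pcmpeqb/pmovmskb expansion below).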
34109 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
34110 const X86Subtarget &Subtarget) {
34111 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
34112 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
34114 // We're looking for an oversized integer equality comparison, but ignore a
34115 // comparison with zero because that gets special treatment in EmitTest().
34116 SDValue X = SetCC->getOperand(0);
34117 SDValue Y = SetCC->getOperand(1);
34118 EVT OpVT = X.getValueType();
34119 unsigned OpSize = OpVT.getSizeInBits();
34120 if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
34123 // TODO: Use PXOR + PTEST for SSE4.1 or later?
34124 // TODO: Add support for AVX-512.
34125 EVT VT = SetCC->getValueType(0);
34127 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
34128 (OpSize == 256 && Subtarget.hasAVX2())) {
34129 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
34130 SDValue VecX = DAG.getBitcast(VecVT, X);
34131 SDValue VecY = DAG.getBitcast(VecVT, Y);
34133 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
34134 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
34135 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
34136 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
34137 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
34138 SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
34139 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
34140 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
34142 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
34148 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
34149 const X86Subtarget &Subtarget) {
34150 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
34151 SDValue LHS = N->getOperand(0);
34152 SDValue RHS = N->getOperand(1);
34153 EVT VT = N->getValueType(0);
34156 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
34157 EVT OpVT = LHS.getValueType();
34158 // 0-x == y --> x+y == 0
34159 // 0-x != y --> x+y != 0
34160 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
34162 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
34163 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34165 // x == 0-y --> x+y == 0
34166 // x != 0-y --> x+y != 0
34167 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
34169 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
34170 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34173 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
34177 if (VT.getScalarType() == MVT::i1 &&
34178 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
34179 bool IsSEXT0 =
34180 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34181 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34182 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34184 if (!IsSEXT0 || !IsVZero1) {
34185 // Swap the operands and update the condition code.
34186 std::swap(LHS, RHS);
34187 CC = ISD::getSetCCSwappedOperands(CC);
34189 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34190 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34191 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34194 if (IsSEXT0 && IsVZero1) {
34195 assert(VT == LHS.getOperand(0).getValueType() &&
34196 "Uexpected operand type");
34197 if (CC == ISD::SETGT)
34198 return DAG.getConstant(0, DL, VT);
34199 if (CC == ISD::SETLE)
34200 return DAG.getConstant(1, DL, VT);
34201 if (CC == ISD::SETEQ || CC == ISD::SETGE)
34202 return DAG.getNOT(DL, LHS.getOperand(0), VT);
34204 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
34205 "Unexpected condition code!");
34206 return LHS.getOperand(0);
34210 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
34211 // to avoid scalarization via legalization because v4i32 is not a legal type.
34212 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
34213 LHS.getValueType() == MVT::v4f32)
34214 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
34219 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
34221 // Gather and Scatter instructions use k-registers for masks. The type of
34222 // the masks is v*i1. So the mask will be truncated anyway.
34223 // The SIGN_EXTEND_INREG may be dropped.
34224 SDValue Mask = N->getOperand(2);
34225 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
34226 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
34227 NewOps[2] = Mask.getOperand(0);
34228 DAG.UpdateNodeOperands(N, NewOps);
34233 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
34234 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
34235 const X86Subtarget &Subtarget) {
34237 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
34238 SDValue EFLAGS = N->getOperand(1);
34240 // Try to simplify the EFLAGS and condition code operands.
34241 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
34242 return getSETCC(CC, Flags, DL, DAG);
34247 /// Optimize branch condition evaluation.
34248 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
34249 const X86Subtarget &Subtarget) {
34251 SDValue EFLAGS = N->getOperand(3);
34252 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
34254 // Try to simplify the EFLAGS and condition code operands.
34255 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
34256 // RAUW them under us.
34257 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
34258 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
34259 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
34260 N->getOperand(1), Cond, Flags);
34266 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
34267 SelectionDAG &DAG) {
34268 // Take advantage of vector comparisons producing 0 or -1 in each lane to
34269 // optimize away operation when it's from a constant.
34271 // The general transformation is:
34272 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
34273 // AND(VECTOR_CMP(x,y), constant2)
34274 // constant2 = UNARYOP(constant)
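// This is valid because each lane of VECTOR_CMP is either all-ones or all-zeros,
// so per lane UNARYOP(AND(mask, c)) is either UNARYOP(c) or UNARYOP(0), and for
// SINT_TO_FP (the user of this combine below) UNARYOP(0) is 0.0, whose bit
// pattern is all zeros. For example, sint_to_fp(and(mask, <4 x i32> <2,2,2,2>))
// can become and(mask, bitcast(<4 x float> <2.0,2.0,2.0,2.0>)).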
34276 // Early exit if this isn't a vector operation, the operand of the
34277 // unary operation isn't a bitwise AND, or if the sizes of the operations
34278 // aren't the same.
34279 EVT VT = N->getValueType(0);
34280 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
34281 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
34282 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
34285 // Now check that the other operand of the AND is a constant. We could
34286 // make the transformation for non-constant splats as well, but it's unclear
34287 // that would be a benefit as it would not eliminate any operations, just
34288 // perform one more step in scalar code before moving to the vector unit.
34289 if (BuildVectorSDNode *BV =
34290 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
34291 // Bail out if the vector isn't a constant.
34292 if (!BV->isConstant())
34295 // Everything checks out. Build up the new and improved node.
34297 EVT IntVT = BV->getValueType(0);
34298 // Create a new constant of the appropriate type for the transformed op.
34300 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
34301 // The AND node needs bitcasts to/from an integer vector type around it.
34302 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
34303 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
34304 N->getOperand(0)->getOperand(0), MaskConst);
34305 SDValue Res = DAG.getBitcast(VT, NewAnd);
34312 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
34313 const X86Subtarget &Subtarget) {
34314 SDValue Op0 = N->getOperand(0);
34315 EVT VT = N->getValueType(0);
34316 EVT InVT = Op0.getValueType();
34317 EVT InSVT = InVT.getScalarType();
34318 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34320 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
34321 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
34322 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
34324 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34325 InVT.getVectorNumElements());
34326 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
34328 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
34329 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
34331 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34334 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
34335 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
34336 // the optimization here.
34337 if (DAG.SignBitIsZero(Op0))
34338 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
34343 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
34344 const X86Subtarget &Subtarget) {
34345 // First try to optimize away the conversion entirely when it's
34346 // conditionally from a constant. Vectors only.
34347 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
34350 // Now move on to more general possibilities.
34351 SDValue Op0 = N->getOperand(0);
34352 EVT VT = N->getValueType(0);
34353 EVT InVT = Op0.getValueType();
34354 EVT InSVT = InVT.getScalarType();
34356 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
34357 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
34358 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
34359 if (InVT.isVector() &&
34360 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
34361 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
34363 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34364 InVT.getVectorNumElements());
34365 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
34366 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34369 // Without AVX512DQ we only support i64 to float scalar conversion. For both
34370 // vectors and scalars, see if we know that the upper bits are all the sign
34371 // bit, in which case we can truncate the input to i32 and convert from that.
34372 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
34373 unsigned BitWidth = InVT.getScalarSizeInBits();
34374 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
34375 if (NumSignBits >= (BitWidth - 31)) {
34376 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
34377 if (InVT.isVector())
34378 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
34379 InVT.getVectorNumElements());
34381 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
34382 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
34386 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
34387 // a 32-bit target where SSE doesn't support i64->FP operations.
34388 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
34389 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
34390 EVT LdVT = Ld->getValueType(0);
34392 // This transformation is not supported if the result type is f16 or f128.
34393 if (VT == MVT::f16 || VT == MVT::f128)
34396 if (!Ld->isVolatile() && !VT.isVector() &&
34397 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
34398 !Subtarget.is64Bit() && LdVT == MVT::i64) {
34399 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
34400 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
34401 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
34408 // Optimize RES, EFLAGS = X86ISD::ADD LHS, RHS
34409 static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG,
34410 X86TargetLowering::DAGCombinerInfo &DCI) {
34411 // When legalizing carry, we create carries via "add X, -1".
34412 // If that comes from an actual carry, via setcc, we use the carry flag directly.
34414 if (isAllOnesConstant(N->getOperand(1)) && N->hasAnyUseOfValue(1)) {
34415 SDValue Carry = N->getOperand(0);
34416 while (Carry.getOpcode() == ISD::TRUNCATE ||
34417 Carry.getOpcode() == ISD::ZERO_EXTEND ||
34418 Carry.getOpcode() == ISD::SIGN_EXTEND ||
34419 Carry.getOpcode() == ISD::ANY_EXTEND ||
34420 (Carry.getOpcode() == ISD::AND &&
34421 isOneConstant(Carry.getOperand(1))))
34422 Carry = Carry.getOperand(0);
34424 if (Carry.getOpcode() == ISD::SETCC ||
34425 Carry.getOpcode() == X86ISD::SETCC ||
34426 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
34427 if (Carry.getConstantOperandVal(0) == X86::COND_B)
34428 return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1));
34435 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
34436 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
34437 X86TargetLowering::DAGCombinerInfo &DCI) {
34438 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
34439 // the result is either zero or one (depending on the input carry bit).
34440 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
34441 if (X86::isZeroNode(N->getOperand(0)) &&
34442 X86::isZeroNode(N->getOperand(1)) &&
34443 // We don't have a good way to replace an EFLAGS use, so only do this when the
34444 // EFLAGS result is unused.
34445 SDValue(N, 1).use_empty()) {
34447 EVT VT = N->getValueType(0);
34448 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
34449 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
34450 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
34451 DAG.getConstant(X86::COND_B, DL,
34454 DAG.getConstant(1, DL, VT));
34455 return DCI.CombineTo(N, Res1, CarryOut);
34461 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
34462 /// which is more useful than 0/1 in some cases.
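/// (SETB produces 0 or 1, while "sbb reg,reg" computes reg - reg - CF, i.e. 0
/// or all-ones depending on the carry flag, which can be used directly as a
/// mask.)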
34463 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
34465 // "Condition code B" is also known as "the carry flag" (CF).
34466 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
34467 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
34468 MVT VT = N->getSimpleValueType(0);
34470 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
34472 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
34473 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
34476 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
34477 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
34478 /// with CMP+{ADC, SBB}.
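/// The key observation used below is that "cmp Z, 1" sets the carry flag
/// exactly when Z == 0, so a (Z == 0) or (Z != 0) operand can be folded into
/// the carry input of an ADC/SBB instead of being materialized with SETCC.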
34479 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
34480 bool IsSub = N->getOpcode() == ISD::SUB;
34481 SDValue X = N->getOperand(0);
34482 SDValue Y = N->getOperand(1);
34484 // If this is an add, canonicalize a zext operand to the RHS.
34485 // TODO: Incomplete? What if both sides are zexts?
34486 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
34487 Y.getOpcode() != ISD::ZERO_EXTEND)
34490 // Look through a one-use zext.
34491 bool PeekedThroughZext = false;
34492 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
34493 Y = Y.getOperand(0);
34494 PeekedThroughZext = true;
34497 // If this is an add, canonicalize a setcc operand to the RHS.
34498 // TODO: Incomplete? What if both sides are setcc?
34499 // TODO: Should we allow peeking through a zext of the other operand?
34500 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
34501 Y.getOpcode() != X86ISD::SETCC)
34504 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
34508 EVT VT = N->getValueType(0);
34509 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
34511 if (CC == X86::COND_B) {
34512 // X + SETB Z --> X + (mask SBB Z, Z)
34513 // X - SETB Z --> X - (mask SBB Z, Z)
34514 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
34515 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
34516 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
34517 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
34518 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
34521 if (CC == X86::COND_A) {
34522 SDValue EFLAGS = Y->getOperand(1);
34523 // Try to convert COND_A into COND_B in an attempt to facilitate
34524 // materializing "setb reg".
34526 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
34527 // cannot take an immediate as its first operand.
34529 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
34530 EFLAGS.getValueType().isInteger() &&
34531 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
34532 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
34533 EFLAGS.getNode()->getVTList(),
34534 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
34535 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
34536 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
34537 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
34538 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
34539 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
34543 if (CC != X86::COND_E && CC != X86::COND_NE)
34546 SDValue Cmp = Y.getOperand(1);
34547 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
34548 !X86::isZeroNode(Cmp.getOperand(1)) ||
34549 !Cmp.getOperand(0).getValueType().isInteger())
34552 // (cmp Z, 1) sets the carry flag if Z is 0.
34553 SDValue Z = Cmp.getOperand(0);
34554 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
34555 DAG.getConstant(1, DL, Z.getValueType()));
34557 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
34559 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
34560 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
34561 if (CC == X86::COND_NE)
34562 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
34563 DAG.getConstant(-1ULL, DL, VT), NewCmp);
34565 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
34566 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
34567 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
34568 DAG.getConstant(0, DL, VT), NewCmp);
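// Attempt to turn a vector-reduction add of the form add(mul(x, y), phi), where
// the multiply operands are known to fit in 16 bits, into a VPMADDWD (pmaddwd)
// feeding the reduction. The pmaddwd result has half as many (i32) elements, so
// it is padded with zeros before the final add with the phi.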
34571 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
34572 const X86Subtarget &Subtarget) {
34573 SDValue MulOp = N->getOperand(0);
34574 SDValue Phi = N->getOperand(1);
34576 if (MulOp.getOpcode() != ISD::MUL)
34577 std::swap(MulOp, Phi);
34578 if (MulOp.getOpcode() != ISD::MUL)
34582 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
34585 EVT VT = N->getValueType(0);
34587 unsigned RegSize = 128;
34588 if (Subtarget.hasBWI())
34590 else if (Subtarget.hasAVX2())
34592 unsigned VectorSize = VT.getVectorNumElements() * 16;
34593 // If the vector size is less than 128, or greater than the supported RegSize,
34594 // do not use PMADD.
34595 if (VectorSize < 128 || VectorSize > RegSize)
34599 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
34600 VT.getVectorNumElements());
34601 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34602 VT.getVectorNumElements() / 2);
34604 // Shrink the operands of mul.
34605 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
34606 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
34608 // Madd vector size is half of the original vector size
34609 SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
34610 // Fill the rest of the output with 0
34611 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
34612 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
34613 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
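// Attempt to turn a vector-reduction add whose non-phi operand is a
// zero-extended absolute-difference select pattern into a PSADBW (sum of
// absolute differences) feeding the reduction, via detectZextAbsDiff and
// createPSADBW.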
34616 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
34617 const X86Subtarget &Subtarget) {
34619 EVT VT = N->getValueType(0);
34620 SDValue Op0 = N->getOperand(0);
34621 SDValue Op1 = N->getOperand(1);
34623 // TODO: There's nothing special about i32; any integer type above i16 should
34624 // work just as well.
34625 if (!VT.isVector() || !VT.isSimple() ||
34626 !(VT.getVectorElementType() == MVT::i32))
34629 unsigned RegSize = 128;
34630 if (Subtarget.hasBWI())
34632 else if (Subtarget.hasAVX2())
34635 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
34636 // TODO: We should be able to handle larger vectors by splitting them before
34637 // feeding them into several SADs, and then reducing over those.
34638 if (VT.getSizeInBits() / 4 > RegSize)
34641 // We know N is a reduction add, which means one of its operands is a phi.
34642 // To match SAD, we need the other operand to be a vector select.
34643 SDValue SelectOp, Phi;
34644 if (Op0.getOpcode() == ISD::VSELECT) {
34647 } else if (Op1.getOpcode() == ISD::VSELECT) {
34653 // Check whether we have an abs-diff pattern feeding into the select.
34654 if (!detectZextAbsDiff(SelectOp, Op0, Op1))
34657 // SAD pattern detected. Now build a SAD instruction and an addition for
34658 // reduction. Note that the number of elements of the result of SAD is less
34659 // than the number of elements of its input. Therefore, we can only update
34660 // part of the elements in the reduction vector.
34661 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
34663 // The output of PSADBW is a vector of i64.
34664 // We need to turn the vector of i64 into a vector of i32.
34665 // If the reduction vector is at least as wide as the psadbw result, just
34666 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero anyway.
34668 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
34669 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
34670 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
34672 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
34674 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
34675 // Update part of the elements of the reduction vector. This is done by first
34676 // extracting a sub-vector from it, updating this sub-vector, and inserting it
34677 // back into the original vector.
34678 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
34679 DAG.getIntPtrConstant(0, DL));
34680 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
34681 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
34682 DAG.getIntPtrConstant(0, DL));
34684 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
34687 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
34688 const X86Subtarget &Subtarget) {
34689 const SDNodeFlags Flags = N->getFlags();
34690 if (Flags.hasVectorReduction()) {
34691 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
34693 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
34696 EVT VT = N->getValueType(0);
34697 SDValue Op0 = N->getOperand(0);
34698 SDValue Op1 = N->getOperand(1);
34700 // Try to synthesize horizontal adds from adds of shuffles.
34701 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
34702 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
34703 isHorizontalBinOp(Op0, Op1, true))
34704 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
34706 return combineAddOrSubToADCOrSBB(N, DAG);
34709 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
34710 const X86Subtarget &Subtarget) {
34711 SDValue Op0 = N->getOperand(0);
34712 SDValue Op1 = N->getOperand(1);
34714 // X86 can't encode an immediate LHS of a sub. See if we can push the
34715 // negation into a preceding instruction.
34716 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
34717 // If the RHS of the sub is a XOR with one use and a constant, invert the
34718 // immediate. Then add one to the LHS of the sub so we can turn
34719 // X-Y -> X+~Y+1, saving one register.
34720 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
34721 isa<ConstantSDNode>(Op1.getOperand(1))) {
34722 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
34723 EVT VT = Op0.getValueType();
34724 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
34726 DAG.getConstant(~XorC, SDLoc(Op1), VT));
34727 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
34728 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
34732 // Try to synthesize horizontal subs from subs of shuffles.
34733 EVT VT = N->getValueType(0);
34734 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
34735 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
34736 isHorizontalBinOp(Op0, Op1, false))
34737 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
34739 return combineAddOrSubToADCOrSBB(N, DAG);
34742 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
34743 TargetLowering::DAGCombinerInfo &DCI,
34744 const X86Subtarget &Subtarget) {
34745 if (DCI.isBeforeLegalize())
34749 unsigned Opcode = N->getOpcode();
34750 MVT VT = N->getSimpleValueType(0);
34751 MVT SVT = VT.getVectorElementType();
34752 unsigned NumElts = VT.getVectorNumElements();
34753 unsigned EltSizeInBits = SVT.getSizeInBits();
34755 SDValue Op = N->getOperand(0);
34756 MVT OpVT = Op.getSimpleValueType();
34757 MVT OpEltVT = OpVT.getVectorElementType();
34758 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
34759 unsigned InputBits = OpEltSizeInBits * NumElts;
34761 // Perform any constant folding.
34762 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
34764 SmallVector<APInt, 64> EltBits;
34765 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
34766 APInt Undefs(NumElts, 0);
34767 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
34768 bool IsZEXT =
34769 (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
34770 for (unsigned i = 0; i != NumElts; ++i) {
34771 if (UndefElts[i]) {
34775 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
34776 : EltBits[i].sextOrTrunc(EltSizeInBits);
34778 return getConstVector(Vals, Undefs, VT, DAG, DL);
34781 // (vzext (bitcast (vzext x))) -> (vzext x)
34782 // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
34783 SDValue V = peekThroughBitcasts(Op);
34784 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
34785 MVT InnerVT = V.getSimpleValueType();
34786 MVT InnerEltVT = InnerVT.getVectorElementType();
34788 // If the element sizes match exactly, we can just do one larger vzext. This
34789 // is always an exact type match as vzext operates on integer types.
34790 if (OpEltVT == InnerEltVT) {
34791 assert(OpVT == InnerVT && "Types must match for vzext!");
34792 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
34795 // The only other way we can combine them is if only a single element of the
34796 // inner vzext is used in the input to the outer vzext.
34797 if (InnerEltVT.getSizeInBits() < InputBits)
34800 // In this case, the inner vzext is completely dead because we're going to
34801 // only look at bits inside of the low element. Just do the outer vzext on
34802 // a bitcast of the input to the inner.
34803 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
34806 // Check if we can bypass extracting and re-inserting an element of an input
34807 // vector. Essentially:
34808 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
34809 // TODO: Add X86ISD::VSEXT support
34810 if (Opcode == X86ISD::VZEXT &&
34811 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
34812 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
34813 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
34814 SDValue ExtractedV = V.getOperand(0);
34815 SDValue OrigV = ExtractedV.getOperand(0);
34816 if (isNullConstant(ExtractedV.getOperand(1))) {
34817 MVT OrigVT = OrigV.getSimpleValueType();
34818 // Extract a subvector if necessary...
34819 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
34820 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
34821 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
34822 OrigVT.getVectorNumElements() / Ratio);
34823 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
34824 DAG.getIntPtrConstant(0, DL));
34826 Op = DAG.getBitcast(OpVT, OrigV);
34827 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
34834 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
34835 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
34836 const X86Subtarget &Subtarget) {
34837 SDValue Chain = N->getOperand(0);
34838 SDValue LHS = N->getOperand(1);
34839 SDValue RHS = N->getOperand(2);
34840 MVT VT = RHS.getSimpleValueType();
34843 auto *C = dyn_cast<ConstantSDNode>(RHS);
34844 if (!C || C->getZExtValue() != 1)
34847 RHS = DAG.getConstant(-1, DL, VT);
34848 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
34849 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
34850 DAG.getVTList(MVT::i32, MVT::Other),
34851 {Chain, LHS, RHS}, VT, MMO);
34854 // TEST (AND a, b), (AND a, b) -> TEST a, b
34855 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
34856 SDValue Op0 = N->getOperand(0);
34857 SDValue Op1 = N->getOperand(1);
34859 if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
34862 EVT VT = N->getValueType(0);
34865 return DAG.getNode(X86ISD::TESTM, DL, VT,
34866 Op0->getOperand(0), Op0->getOperand(1));
34869 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
34870 const X86Subtarget &Subtarget) {
34871 MVT VT = N->getSimpleValueType(0);
34874 if (N->getOperand(0) == N->getOperand(1)) {
34875 if (N->getOpcode() == X86ISD::PCMPEQ)
34876 return getOnesVector(VT, DAG, DL);
34877 if (N->getOpcode() == X86ISD::PCMPGT)
34878 return getZeroVector(VT, Subtarget, DAG, DL);
34884 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
34885 TargetLowering::DAGCombinerInfo &DCI,
34886 const X86Subtarget &Subtarget) {
34887 if (DCI.isBeforeLegalizeOps())
34891 SDValue Vec = N->getOperand(0);
34892 SDValue SubVec = N->getOperand(1);
34893 SDValue Idx = N->getOperand(2);
34895 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
34896 MVT OpVT = N->getSimpleValueType(0);
34897 MVT SubVecVT = SubVec.getSimpleValueType();
34899 // If this is an insert of an extract, combine to a shuffle. Don't do this
34900 // if the insert or extract can be represented with a subvector operation.
34901 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
34902 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
34903 (IdxVal != 0 || !Vec.isUndef())) {
34904 int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
34905 if (ExtIdxVal != 0) {
34906 int VecNumElts = OpVT.getVectorNumElements();
34907 int SubVecNumElts = SubVecVT.getVectorNumElements();
34908 SmallVector<int, 64> Mask(VecNumElts);
34909 // First create an identity shuffle mask.
34910 for (int i = 0; i != VecNumElts; ++i)
34911 Mask[i] = i;
34912 // Now insert the extracted portion.
34913 for (int i = 0; i != SubVecNumElts; ++i)
34914 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
34916 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
34920 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
34921 // load:
34922 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
34923 // (load16 addr + 16), Elts/2)
34926 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
34927 // (load32 addr + 32), Elts/2)
34929 // or a 16-byte or 32-byte broadcast:
34930 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
34931 // (load16 addr), Elts/2)
34932 // --> X86SubVBroadcast(load16 addr)
34934 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
34935 // (load32 addr), Elts/2)
34936 // --> X86SubVBroadcast(load32 addr)
34937 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
34938 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
34939 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
34940 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
34941 if (Idx2 && Idx2->getZExtValue() == 0) {
34942 SDValue SubVec2 = Vec.getOperand(1);
34943 // If needed, look through bitcasts to get to the load.
34944 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
34945 bool Fast;
34946 unsigned Alignment = FirstLd->getAlignment();
34947 unsigned AS = FirstLd->getAddressSpace();
34948 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
34949 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
34950 OpVT, AS, Alignment, &Fast) && Fast) {
34951 SDValue Ops[] = {SubVec2, SubVec};
34952 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
34956 // If lower/upper loads are the same and the only users of the load, then
34957 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
34958 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
34959 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
34960 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
34961 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
34964 // If this is a subv_broadcast inserted into both halves, use a larger
34965 // subv_broadcast.
34966 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
34967 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
34968 SubVec.getOperand(0));
34977 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
34978 DAGCombinerInfo &DCI) const {
34979 SelectionDAG &DAG = DCI.DAG;
34980 switch (N->getOpcode()) {
34982 case ISD::EXTRACT_VECTOR_ELT:
34983 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
34984 case X86ISD::PEXTRW:
34985 case X86ISD::PEXTRB:
34986 return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
34987 case ISD::INSERT_SUBVECTOR:
34988 return combineInsertSubvector(N, DAG, DCI, Subtarget);
34991 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
34992 case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
34993 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
34994 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
34995 case ISD::SUB: return combineSub(N, DAG, Subtarget);
34996 case X86ISD::ADD: return combineX86ADD(N, DAG, DCI);
34997 case X86ISD::ADC: return combineADC(N, DAG, DCI);
34998 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
35001 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
35002 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
35003 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
35004 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
35005 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
35006 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
35007 case ISD::STORE: return combineStore(N, DAG, Subtarget);
35008 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
35009 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
35010 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
35012 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
35013 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
35014 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
35015 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
35016 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
35017 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
35019 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
35021 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
35023 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
35024 case X86ISD::BT: return combineBT(N, DAG, DCI);
35025 case ISD::ANY_EXTEND:
35026 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
35027 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
35028 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
35029 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
35030 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
35031 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
35032 case X86ISD::VSHLI:
35033 case X86ISD::VSRAI:
35034 case X86ISD::VSRLI:
35035 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
35036 case ISD::SIGN_EXTEND_VECTOR_INREG:
35037 case ISD::ZERO_EXTEND_VECTOR_INREG:
35038 case X86ISD::VSEXT:
35039 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
35040 case X86ISD::PINSRB:
35041 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
35042 case X86ISD::SHUFP: // Handle all target specific shuffles
35043 case X86ISD::INSERTPS:
35044 case X86ISD::PALIGNR:
35045 case X86ISD::VSHLDQ:
35046 case X86ISD::VSRLDQ:
35047 case X86ISD::BLENDI:
35048 case X86ISD::UNPCKH:
35049 case X86ISD::UNPCKL:
35050 case X86ISD::MOVHLPS:
35051 case X86ISD::MOVLHPS:
35052 case X86ISD::PSHUFB:
35053 case X86ISD::PSHUFD:
35054 case X86ISD::PSHUFHW:
35055 case X86ISD::PSHUFLW:
35056 case X86ISD::MOVSHDUP:
35057 case X86ISD::MOVSLDUP:
35058 case X86ISD::MOVDDUP:
35059 case X86ISD::MOVSS:
35060 case X86ISD::MOVSD:
35061 case X86ISD::VPPERM:
35062 case X86ISD::VPERMI:
35063 case X86ISD::VPERMV:
35064 case X86ISD::VPERMV3:
35065 case X86ISD::VPERMIV3:
35066 case X86ISD::VPERMIL2:
35067 case X86ISD::VPERMILPI:
35068 case X86ISD::VPERMILPV:
35069 case X86ISD::VPERM2X128:
35070 case X86ISD::VZEXT_MOVL:
35071 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
35072 case X86ISD::FMADD:
35073 case X86ISD::FMADD_RND:
35074 case X86ISD::FMADDS1_RND:
35075 case X86ISD::FMADDS3_RND:
35076 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
35078 case ISD::MSCATTER: return combineGatherScatter(N, DAG);
35079 case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
35080 case X86ISD::TESTM: return combineTestM(N, DAG);
35081 case X86ISD::PCMPEQ:
35082 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
35088 /// Return true if the target has native support for the specified value type
35089 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
35090 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
35091 /// some i16 instructions are slow.
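/// (16-bit operations generally need the 0x66 operand-size prefix in 32/64-bit
/// code, and writes to 16-bit registers can incur partial-register stalls.)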
35092 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
35093 if (!isTypeLegal(VT))
35095 if (VT != MVT::i16)
35102 case ISD::SIGN_EXTEND:
35103 case ISD::ZERO_EXTEND:
35104 case ISD::ANY_EXTEND:
35117 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
35118 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
35119 /// we don't adjust the stack we clobber the first frame index.
35120 /// See X86InstrInfo::copyPhysReg.
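/// (Such copies are typically lowered with a pushf/popf sequence, which is why
/// they imply a stack adjustment.)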
35121 static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
35122 const MachineRegisterInfo &MRI = MF.getRegInfo();
35123 return any_of(MRI.reg_instructions(X86::EFLAGS),
35124 [](const MachineInstr &RI) { return RI.isCopy(); });
35127 void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
35128 if (hasCopyImplyingStackAdjustment(MF)) {
35129 MachineFrameInfo &MFI = MF.getFrameInfo();
35130 MFI.setHasCopyImplyingStackAdjustment(true);
35133 TargetLoweringBase::finalizeLowering(MF);
35136 /// This method queries the target whether it is beneficial for the dag combiner to
35137 /// promote the specified node. If true, it should return the desired promotion
35138 /// type by reference.
35139 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
35140 EVT VT = Op.getValueType();
35141 if (VT != MVT::i16)
35144 bool Promote = false;
35145 bool Commute = false;
35146 switch (Op.getOpcode()) {
35148 case ISD::SIGN_EXTEND:
35149 case ISD::ZERO_EXTEND:
35150 case ISD::ANY_EXTEND:
35155 SDValue N0 = Op.getOperand(0);
35156 // Look out for (store (shl (load), x)).
35157 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
35170 SDValue N0 = Op.getOperand(0);
35171 SDValue N1 = Op.getOperand(1);
35172 if (!Commute && MayFoldLoad(N1))
35174 // Avoid disabling potential load folding opportunities.
35175 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
35177 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
35187 //===----------------------------------------------------------------------===//
35188 // X86 Inline Assembly Support
35189 //===----------------------------------------------------------------------===//
35191 // Helper to match an inline-asm string against a list of whitespace-separated pieces.
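// It is used below to recognize inline-asm strings such as {"bswap", "$0"} or
// {"rorw", "$$8,", "${0:w}"}, one whitespace-separated piece at a time.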
35192 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
35193 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
35195 for (StringRef Piece : Pieces) {
35196 if (!S.startswith(Piece)) // Check if the piece matches.
35199 S = S.substr(Piece.size());
35200 StringRef::size_type Pos = S.find_first_not_of(" \t");
35201 if (Pos == 0) // We matched a prefix.
35210 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
35212 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
35213 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
35214 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
35215 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
35217 if (AsmPieces.size() == 3)
35219 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
35226 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
35227 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
35229 const std::string &AsmStr = IA->getAsmString();
35231 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
35232 if (!Ty || Ty->getBitWidth() % 16 != 0)
35235 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
35236 SmallVector<StringRef, 4> AsmPieces;
35237 SplitString(AsmStr, AsmPieces, ";\n");
35239 switch (AsmPieces.size()) {
35240 default: return false;
35242 // FIXME: this should verify that we are targeting a 486 or better. If not,
35243 // we will turn this bswap into something that will be lowered to logical
35244 // ops instead of emitting the bswap asm. For now, we don't support 486 or
35245 // lower so don't worry about this.
35247 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
35248 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
35249 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
35250 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
35251 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
35252 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
35253 // No need to check constraints, nothing other than the equivalent of
35254 // "=r,0" would be valid here.
35255 return IntrinsicLowering::LowerToByteSwap(CI);
35258 // rorw $$8, ${0:w} --> llvm.bswap.i16
35259 if (CI->getType()->isIntegerTy(16) &&
35260 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
35261 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
35262 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
35264 StringRef ConstraintsStr = IA->getConstraintString();
35265 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
35266 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
35267 if (clobbersFlagRegisters(AsmPieces))
35268 return IntrinsicLowering::LowerToByteSwap(CI);
35272 if (CI->getType()->isIntegerTy(32) &&
35273 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
35274 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
35275 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
35276 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
35278 StringRef ConstraintsStr = IA->getConstraintString();
35279 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
35280 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
35281 if (clobbersFlagRegisters(AsmPieces))
35282 return IntrinsicLowering::LowerToByteSwap(CI);
35285 if (CI->getType()->isIntegerTy(64)) {
35286 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
35287 if (Constraints.size() >= 2 &&
35288 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
35289 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
35290 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
35291 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
35292 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
35293 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
35294 return IntrinsicLowering::LowerToByteSwap(CI);
35302 /// Given a constraint letter, return the type of constraint for this target.
35303 X86TargetLowering::ConstraintType
35304 X86TargetLowering::getConstraintType(StringRef Constraint) const {
35305 if (Constraint.size() == 1) {
35306 switch (Constraint[0]) {
35318 return C_RegisterClass;
35319 case 'k': // AVX512 masking registers.
35343 else if (Constraint.size() == 2) {
35344 switch (Constraint[0]) {
35348 switch (Constraint[1]) {
35356 return TargetLowering::getConstraintType(Constraint);
35359 /// Examine constraint type and operand type and determine a weight value.
35360 /// This object must already have been set up with the operand type
35361 /// and the current alternative constraint selected.
35362 TargetLowering::ConstraintWeight
35363 X86TargetLowering::getSingleConstraintMatchWeight(
35364 AsmOperandInfo &info, const char *constraint) const {
35365 ConstraintWeight weight = CW_Invalid;
35366 Value *CallOperandVal = info.CallOperandVal;
35367 // If we don't have a value, we can't do a match,
35368 // but allow it at the lowest weight.
35369 if (!CallOperandVal)
35371 Type *type = CallOperandVal->getType();
35372 // Look at the constraint type.
35373 switch (*constraint) {
35375 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
35386 if (CallOperandVal->getType()->isIntegerTy())
35387 weight = CW_SpecificReg;
35392 if (type->isFloatingPointTy())
35393 weight = CW_SpecificReg;
35396 if (type->isX86_MMXTy() && Subtarget.hasMMX())
35397 weight = CW_SpecificReg;
35400 // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
35401 if (constraint[1] == 'k') {
35402 // Support for 'Yk' (similarly to the 'k' variant below).
35403 weight = CW_SpecificReg;
35406 // Else fall through (handle "Y" constraint).
35409 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
35410 weight = CW_Register;
35413 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
35414 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
35415 weight = CW_Register;
35418 // Enable conditional vector operations using %k<#> registers.
35419 weight = CW_SpecificReg;
35422 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
35423 if (C->getZExtValue() <= 31)
35424 weight = CW_Constant;
35428 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35429 if (C->getZExtValue() <= 63)
35430 weight = CW_Constant;
35434 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35435 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
35436 weight = CW_Constant;
35440 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35441 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
35442 weight = CW_Constant;
35446 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35447 if (C->getZExtValue() <= 3)
35448 weight = CW_Constant;
35452 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35453 if (C->getZExtValue() <= 0xff)
35454 weight = CW_Constant;
35459 if (isa<ConstantFP>(CallOperandVal)) {
35460 weight = CW_Constant;
35464 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35465 if ((C->getSExtValue() >= -0x80000000LL) &&
35466 (C->getSExtValue() <= 0x7fffffffLL))
35467 weight = CW_Constant;
35471 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35472 if (C->getZExtValue() <= 0xffffffff)
35473 weight = CW_Constant;
35480 /// Try to replace an X constraint, which matches anything, with another that
35481 /// has more specific requirements based on the type of the corresponding
35482 /// operand.
35483 const char *X86TargetLowering::
35484 LowerXConstraint(EVT ConstraintVT) const {
35485 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
35486 // 'f' like normal targets.
35487 if (ConstraintVT.isFloatingPoint()) {
35488 if (Subtarget.hasSSE2())
35489 return "Y";
35490 if (Subtarget.hasSSE1())
35491 return "x";
35492 }
35494 return TargetLowering::LowerXConstraint(ConstraintVT);
35495 }
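// Illustrative only (not part of the original source): with the lowering above,
// an "X" (anything goes) constraint on a floating-point operand is narrowed to
// an SSE register constraint when SSE is available, instead of the x87 'f'
// constraint. A sketch, assuming an SSE2-capable target:
//
//   double d = 1.0;
//   __asm__ volatile("" : "+X"(d));   // treated like an SSE-register constraint,
//                                     // so d stays in an XMM register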
35497 /// Lower the specified operand into the Ops vector.
35498 /// If it is invalid, don't add anything to Ops.
35499 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
35500 std::string &Constraint,
35501 std::vector<SDValue>&Ops,
35502 SelectionDAG &DAG) const {
35505 // Only support length 1 constraints for now.
35506 if (Constraint.length() > 1) return;
35508 char ConstraintLetter = Constraint[0];
35509 switch (ConstraintLetter) {
35512 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35513 if (C->getZExtValue() <= 31) {
35514 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35515 Op.getValueType());
35521 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35522 if (C->getZExtValue() <= 63) {
35523 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35524 Op.getValueType());
35530 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35531 if (isInt<8>(C->getSExtValue())) {
35532 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35533 Op.getValueType());
35539 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35540 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
35541 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
35542 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
35543 Op.getValueType());
35549 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35550 if (C->getZExtValue() <= 3) {
35551 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35552 Op.getValueType());
35558 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35559 if (C->getZExtValue() <= 255) {
35560 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35561 Op.getValueType());
35567 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35568 if (C->getZExtValue() <= 127) {
35569 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35570 Op.getValueType());
35576 // 32-bit signed value
35577 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35578 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
35579 C->getSExtValue())) {
35580 // Widen to 64 bits here to get it sign extended.
35581 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
35584 // FIXME gcc accepts some relocatable values here too, but only in certain
35585 // memory models; it's complicated.
35590 // 32-bit unsigned value
35591 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35592 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
35593 C->getZExtValue())) {
35594 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35595 Op.getValueType());
35599 // FIXME gcc accepts some relocatable values here too, but only in certain
35600 // memory models; it's complicated.
35604 // Literal immediates are always ok.
35605 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
35606 // Widen to 64 bits here to get it sign extended.
35607 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
35611 // In any sort of PIC mode addresses need to be computed at runtime by
35612 // adding in a register or some sort of table lookup. These can't
35613 // be used as immediates.
35614 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
35617 // If we are in non-pic codegen mode, we allow the address of a global (with
35618 // an optional displacement) to be used with 'i'.
35619 GlobalAddressSDNode *GA = nullptr;
35620 int64_t Offset = 0;
35622 // Match either (GA), (GA+C), (GA+C1+C2), etc.
35624 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
35625 Offset += GA->getOffset();
35627 } else if (Op.getOpcode() == ISD::ADD) {
35628 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
35629 Offset += C->getZExtValue();
35630 Op = Op.getOperand(0);
35633 } else if (Op.getOpcode() == ISD::SUB) {
35634 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
35635 Offset += -C->getZExtValue();
35636 Op = Op.getOperand(0);
35641 // Otherwise, this isn't something we can handle, reject it.
35645 const GlobalValue *GV = GA->getGlobal();
35646 // If we require an extra load to get this address, as in PIC mode, we
35647 // can't accept it.
35648 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
35651 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
35652 GA->getValueType(0), Offset);
35657 if (Result.getNode()) {
35658 Ops.push_back(Result);
35659 return;
35660 }
35661 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
35662 }
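// Illustrative only (not part of the original source): the per-letter immediate
// ranges above mirror the GCC x86 constraints. A sketch, assuming an x86 target:
//
//   __asm__ volatile("" : : "I"(31));    // ok: 'I' accepts 0..31
//   __asm__ volatile("" : : "J"(63));    // ok: 'J' accepts 0..63
//   __asm__ volatile("" : : "K"(-128));  // ok: 'K' accepts signed 8-bit values
//
// An out-of-range value (e.g. "I"(32)) is not added to Ops here and is
// ultimately reported as an invalid operand for the constraint.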
35664 /// Check if \p RC is a general purpose register class.
35665 /// I.e., GR* or one of their variants.
35666 static bool isGRClass(const TargetRegisterClass &RC) {
35667 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
35668 RC.hasSuperClassEq(&X86::GR16RegClass) ||
35669 RC.hasSuperClassEq(&X86::GR32RegClass) ||
35670 RC.hasSuperClassEq(&X86::GR64RegClass) ||
35671 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
35674 /// Check if \p RC is a vector register class.
35675 /// I.e., FR* / VR* or one of their variants.
35676 static bool isFRClass(const TargetRegisterClass &RC) {
35677 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
35678 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
35679 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
35680 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
35681 RC.hasSuperClassEq(&X86::VR512RegClass);
35682 }
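// Illustrative only (not part of the original source): hasSuperClassEq matches
// the named class and its subclasses, so isGRClass is expected to return true
// for, e.g., X86::GR32_ABCDRegClass because GR32 is one of its super classes.
// These predicates are what getRegForInlineAsmConstraint uses below to decide
// whether a register picked by name (such as "{ax}") should be resized to
// match the operand type.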
35684 std::pair<unsigned, const TargetRegisterClass *>
35685 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
35686 StringRef Constraint,
35688 // First, see if this is a constraint that directly corresponds to an LLVM
35690 if (Constraint.size() == 1) {
35691 // GCC Constraint Letters
35692 switch (Constraint[0]) {
35694 // TODO: Slight differences here in allocation order and leaving
35695 // RIP in the class. Do they matter any more here than they do
35696 // in the normal allocation?
35697 case 'k':
35698 if (Subtarget.hasAVX512()) {
35699 // Only supported in AVX512 or later.
35700 switch (VT.SimpleTy) {
35701 default: break;
35702 case MVT::i32:
35703 return std::make_pair(0U, &X86::VK32RegClass);
35704 case MVT::i16:
35705 return std::make_pair(0U, &X86::VK16RegClass);
35706 case MVT::i8:
35707 return std::make_pair(0U, &X86::VK8RegClass);
35708 case MVT::i1:
35709 return std::make_pair(0U, &X86::VK1RegClass);
35710 case MVT::i64:
35711 return std::make_pair(0U, &X86::VK64RegClass);
35715 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
35716 if (Subtarget.is64Bit()) {
35717 if (VT == MVT::i32 || VT == MVT::f32)
35718 return std::make_pair(0U, &X86::GR32RegClass);
35719 if (VT == MVT::i16)
35720 return std::make_pair(0U, &X86::GR16RegClass);
35721 if (VT == MVT::i8 || VT == MVT::i1)
35722 return std::make_pair(0U, &X86::GR8RegClass);
35723 if (VT == MVT::i64 || VT == MVT::f64)
35724 return std::make_pair(0U, &X86::GR64RegClass);
35727 // 32-bit fallthrough
35728 case 'Q': // Q_REGS
35729 if (VT == MVT::i32 || VT == MVT::f32)
35730 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
35731 if (VT == MVT::i16)
35732 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
35733 if (VT == MVT::i8 || VT == MVT::i1)
35734 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
35735 if (VT == MVT::i64)
35736 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
35738 case 'r': // GENERAL_REGS
35739 case 'l': // INDEX_REGS
35740 if (VT == MVT::i8 || VT == MVT::i1)
35741 return std::make_pair(0U, &X86::GR8RegClass);
35742 if (VT == MVT::i16)
35743 return std::make_pair(0U, &X86::GR16RegClass);
35744 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
35745 return std::make_pair(0U, &X86::GR32RegClass);
35746 return std::make_pair(0U, &X86::GR64RegClass);
35747 case 'R': // LEGACY_REGS
35748 if (VT == MVT::i8 || VT == MVT::i1)
35749 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
35750 if (VT == MVT::i16)
35751 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
35752 if (VT == MVT::i32 || !Subtarget.is64Bit())
35753 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
35754 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
35755 case 'f': // FP Stack registers.
35756 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
35757 // value to the correct fpstack register class.
35758 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
35759 return std::make_pair(0U, &X86::RFP32RegClass);
35760 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
35761 return std::make_pair(0U, &X86::RFP64RegClass);
35762 return std::make_pair(0U, &X86::RFP80RegClass);
35763 case 'y': // MMX_REGS if MMX allowed.
35764 if (!Subtarget.hasMMX()) break;
35765 return std::make_pair(0U, &X86::VR64RegClass);
35766 case 'Y': // SSE_REGS if SSE2 allowed
35767 if (!Subtarget.hasSSE2()) break;
35769 case 'v':
35770 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
35771 if (!Subtarget.hasSSE1()) break;
35772 bool VConstraint = (Constraint[0] == 'v');
35774 switch (VT.SimpleTy) {
35776 // Scalar SSE types.
35779 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
35780 return std::make_pair(0U, &X86::FR32XRegClass);
35781 return std::make_pair(0U, &X86::FR32RegClass);
35784 if (VConstraint && Subtarget.hasVLX())
35785 return std::make_pair(0U, &X86::FR64XRegClass);
35786 return std::make_pair(0U, &X86::FR64RegClass);
35787 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
35795 if (VConstraint && Subtarget.hasVLX())
35796 return std::make_pair(0U, &X86::VR128XRegClass);
35797 return std::make_pair(0U, &X86::VR128RegClass);
35805 if (VConstraint && Subtarget.hasVLX())
35806 return std::make_pair(0U, &X86::VR256XRegClass);
35807 return std::make_pair(0U, &X86::VR256RegClass);
35812 return std::make_pair(0U, &X86::VR512RegClass);
35816 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
35817 switch (Constraint[1]) {
35820 case 'k':
35821 // This register class doesn't allocate k0 for masked vector operations.
35822 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
35823 switch (VT.SimpleTy) {
35824 default: break;
35825 case MVT::i32:
35826 return std::make_pair(0U, &X86::VK32WMRegClass);
35827 case MVT::i16:
35828 return std::make_pair(0U, &X86::VK16WMRegClass);
35829 case MVT::i8:
35830 return std::make_pair(0U, &X86::VK8WMRegClass);
35831 case MVT::i1:
35832 return std::make_pair(0U, &X86::VK1WMRegClass);
35833 case MVT::i64:
35834 return std::make_pair(0U, &X86::VK64WMRegClass);
35841 // Use the default implementation in TargetLowering to convert the register
35842 // constraint into a member of a register class.
35843 std::pair<unsigned, const TargetRegisterClass*> Res;
35844 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
35846 // Not found as a standard register?
35847 if (!Res.second) {
35848 // Map "st(0)" .. "st(7)" to the corresponding FP register.
35849 if (Constraint.size() == 7 && Constraint[0] == '{' &&
35850 tolower(Constraint[1]) == 's' &&
35851 tolower(Constraint[2]) == 't' &&
35852 Constraint[3] == '(' &&
35853 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
35854 Constraint[5] == ')' &&
35855 Constraint[6] == '}') {
35857 Res.first = X86::FP0+Constraint[4]-'0';
35858 Res.second = &X86::RFP80RegClass;
35862 // GCC allows "st(0)" to be called just plain "st".
35863 if (StringRef("{st}").equals_lower(Constraint)) {
35864 Res.first = X86::FP0;
35865 Res.second = &X86::RFP80RegClass;
35870 if (StringRef("{flags}").equals_lower(Constraint)) {
35871 Res.first = X86::EFLAGS;
35872 Res.second = &X86::CCRRegClass;
35876 // 'A' means [ER]AX + [ER]DX.
35877 if (Constraint == "A") {
35878 if (Subtarget.is64Bit()) {
35879 Res.first = X86::RAX;
35880 Res.second = &X86::GR64_ADRegClass;
35882 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
35883 "Expecting 64, 32 or 16 bit subtarget");
35884 Res.first = X86::EAX;
35885 Res.second = &X86::GR32_ADRegClass;
35892 // Otherwise, check to see if this is a register class of the wrong value
35893 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
35894 // turn into {ax},{dx}.
35895 // MVT::Other is used to specify clobber names.
35896 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
35897 return Res; // Correct type already, nothing to do.
35899 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
35900 // return "eax". This should even work for things like getting 64-bit integer
35901 // registers when given an f64 type.
35902 const TargetRegisterClass *Class = Res.second;
35903 // The generic code will match the first register class that contains the
35904 // given register. Thus, based on the ordering of the tablegened file,
35905 // the "plain" GR classes might not come first.
35906 // Therefore, use a helper method.
35907 if (isGRClass(*Class)) {
35908 unsigned Size = VT.getSizeInBits();
35909 if (Size == 1) Size = 8;
35910 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
35911 if (DestReg > 0) {
35912 Res.first = DestReg;
35913 Res.second = Size == 8 ? &X86::GR8RegClass
35914 : Size == 16 ? &X86::GR16RegClass
35915 : Size == 32 ? &X86::GR32RegClass
35916 : &X86::GR64RegClass;
35917 assert(Res.second->contains(Res.first) && "Register in register class");
35918 } else {
35919 // No register found/type mismatch.
35920 Res.first = 0;
35921 Res.second = nullptr;
35923 } else if (isFRClass(*Class)) {
35924 // Handle references to XMM physical registers that got mapped into the
35925 // wrong class. This can happen with constraints like {xmm0} where the
35926 // target independent register mapper will just pick the first match it can
35927 // find, ignoring the required type.
35929 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
35930 if (VT == MVT::f32 || VT == MVT::i32)
35931 Res.second = &X86::FR32RegClass;
35932 else if (VT == MVT::f64 || VT == MVT::i64)
35933 Res.second = &X86::FR64RegClass;
35934 else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
35935 Res.second = &X86::VR128RegClass;
35936 else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
35937 Res.second = &X86::VR256RegClass;
35938 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
35939 Res.second = &X86::VR512RegClass;
35941 // Type mismatch and not a clobber: return an error.
35942 Res.first = 0;
35943 Res.second = nullptr;
35947 return Res;
35948 }
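// Illustrative only (not part of the original source): a sketch of the expected
// remapping for the "{ax},i32" case mentioned in the comment above, assuming an
// x86-64 subtarget:
//
//   // Constraint "{ax}" with VT == MVT::i32:
//   //   generic lookup -> (X86::AX, a 16-bit GR class)
//   //   remapped here  -> (X86::EAX, &X86::GR32RegClass)
//   // Constraint "{xmm0}" with VT == MVT::f64:
//   //   the class is corrected to &X86::FR64RegClass while keeping XMM0.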
35950 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
35951 const AddrMode &AM, Type *Ty,
35952 unsigned AS) const {
35953 // Scaling factors are not free at all.
35954 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
35955 // will take 2 allocations in the out of order engine instead of 1
35956 // for plain addressing mode, i.e. inst (reg1).
35958 // vaddps (%rsi,%rdx), %ymm0, %ymm1
35959 // Requires two allocations (one for the load, one for the computation)
35961 // vaddps (%rsi), %ymm0, %ymm1
35962 // Requires just 1 allocation, i.e., freeing allocations for other operations
35963 // and having less micro operations to execute.
35965 // On some X86 microarchitectures this is even worse: for stores, for instance,
35966 // the complex addressing mode forces the instruction to use the
35967 // "load" ports instead of the dedicated "store" port.
35968 // E.g., on Haswell:
35969 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
35970 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
35971 if (isLegalAddressingMode(DL, AM, Ty, AS))
35972 // Scale represents reg2 * scale, thus account for 1
35973 // as soon as we use a second register.
35974 return AM.Scale != 0;
35975 return -1;
35976 }
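// Illustrative only (not part of the original source): a sketch of how this
// cost hook is queried, assuming the addressing mode is legal on the target:
//
//   TargetLowering::AddrMode AM;
//   AM.BaseGV = nullptr;
//   AM.BaseOffs = 0;
//   AM.HasBaseReg = true;
//   AM.Scale = 2;                 // base + 2*index
//   // getScalingFactorCost(DL, AM, Ty, AS) is expected to return 1 here (the
//   // scaled index costs one extra allocation); with AM.Scale == 0 it returns 0,
//   // and a negative value when the addressing mode is not legal at all.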
35978 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
35979 // Integer division on x86 is expensive. However, when aggressively optimizing
35980 // for code size, we prefer to use a div instruction, as it is usually smaller
35981 // than the alternative sequence.
35982 // The exception to this is vector division. Since x86 doesn't have vector
35983 // integer division, leaving the division as-is is a loss even in terms of
35984 // size, because it will have to be scalarized, while the alternative code
35985 // sequence can be performed in vector form.
35986 bool OptSize =
35987 Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
35988 return OptSize && !VT.isVector();
35989 }
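// Illustrative only (not part of the original source): a sketch of the intent,
// assuming a function compiled with the minsize attribute:
//
//   int f(int a, int b) { return a / b; }   // scalar: keep the idiv (smaller)
//
// whereas a vector division is still expanded, since the scalarized idivs would
// not be smaller than the alternative sequence.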
35991 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
35992 if (!Subtarget.is64Bit())
35995 // Update IsSplitCSR in X86MachineFunctionInfo.
35996 X86MachineFunctionInfo *AFI =
35997 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
35998 AFI->setIsSplitCSR(true);
36001 void X86TargetLowering::insertCopiesSplitCSR(
36002 MachineBasicBlock *Entry,
36003 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
36004 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36005 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
36006 if (!IStart)
36007 return;
36009 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36010 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
36011 MachineBasicBlock::iterator MBBI = Entry->begin();
36012 for (const MCPhysReg *I = IStart; *I; ++I) {
36013 const TargetRegisterClass *RC = nullptr;
36014 if (X86::GR64RegClass.contains(*I))
36015 RC = &X86::GR64RegClass;
36016 else
36017 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
36019 unsigned NewVR = MRI->createVirtualRegister(RC);
36020 // Create copy from CSR to a virtual register.
36021 // FIXME: this currently does not emit CFI pseudo-instructions, it works
36022 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
36023 // nounwind. If we want to generalize this later, we may need to emit
36024 // CFI pseudo-instructions.
36025 assert(Entry->getParent()->getFunction()->hasFnAttribute(
36026 Attribute::NoUnwind) &&
36027 "Function should be nounwind in insertCopiesSplitCSR!");
36028 Entry->addLiveIn(*I);
36029 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
36030 .addReg(*I);
36032 // Insert the copy-back instructions right before the terminator.
36033 for (auto *Exit : Exits)
36034 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
36035 TII->get(TargetOpcode::COPY), *I)
36036 .addReg(NewVR);
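// Illustrative only (not part of the original source): split CSR handling is
// used for functions with the CXX_FAST_TLS calling convention on x86-64. For
// such functions the callee-saved GR64 registers returned by
// getCalleeSavedRegsViaCopy are copied into virtual registers in the entry
// block and copied back before each return, instead of being spilled to and
// reloaded from the stack.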
36040 bool X86TargetLowering::supportSwiftError() const {
36041 return Subtarget.is64Bit();