//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86MachineFunctionInfo.h"
21 #include "X86ShuffleDecodeConstantPool.h"
22 #include "X86TargetMachine.h"
23 #include "X86TargetObjectFile.h"
24 #include "llvm/ADT/SmallBitVector.h"
25 #include "llvm/ADT/SmallSet.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringExtras.h"
28 #include "llvm/ADT/StringSwitch.h"
29 #include "llvm/Analysis/EHPersonalities.h"
30 #include "llvm/CodeGen/IntrinsicLowering.h"
31 #include "llvm/CodeGen/MachineFrameInfo.h"
32 #include "llvm/CodeGen/MachineFunction.h"
33 #include "llvm/CodeGen/MachineInstrBuilder.h"
34 #include "llvm/CodeGen/MachineJumpTableInfo.h"
35 #include "llvm/CodeGen/MachineModuleInfo.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/WinEHFuncInfo.h"
38 #include "llvm/IR/CallSite.h"
39 #include "llvm/IR/CallingConv.h"
40 #include "llvm/IR/Constants.h"
41 #include "llvm/IR/DerivedTypes.h"
42 #include "llvm/IR/Function.h"
43 #include "llvm/IR/GlobalAlias.h"
44 #include "llvm/IR/GlobalVariable.h"
45 #include "llvm/IR/Instructions.h"
46 #include "llvm/IR/Intrinsics.h"
47 #include "llvm/MC/MCAsmInfo.h"
48 #include "llvm/MC/MCContext.h"
49 #include "llvm/MC/MCExpr.h"
50 #include "llvm/MC/MCSymbol.h"
51 #include "llvm/Support/CommandLine.h"
52 #include "llvm/Support/Debug.h"
53 #include "llvm/Support/ErrorHandling.h"
54 #include "llvm/Support/MathExtras.h"
55 #include "llvm/Target/TargetOptions.h"
56 #include "X86IntrinsicsInfo.h"
62 #define DEBUG_TYPE "x86-isel"
64 STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);
72 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
73 const X86Subtarget &STI)
74 : TargetLowering(TM), Subtarget(STI) {
75 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
76 X86ScalarSSEf64 = Subtarget.hasSSE2();
77 X86ScalarSSEf32 = Subtarget.hasSSE1();
78 MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
80 // Set up the TargetLowering object.
82 // X86 is weird. It always uses i8 for shift amounts and setcc results.
83 setBooleanContents(ZeroOrOneBooleanContent);
84 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
85 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
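  // For illustration (informal, not exhaustive): a scalar compare is lowered
  // to CMP + SETcc and materializes 0 or 1 in an i8 register, whereas a
  // vector compare such as PCMPEQD writes all-ones (-1) or all-zeros into
  // each lane:
  //   setcc i32 eq        -> cmpl %esi, %edi ; sete %al       (0 or 1)
  //   setcc <4 x i32> eq  -> pcmpeqd %xmm1, %xmm0             (0 or -1 lanes)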
87 // For 64-bit, since we have so many registers, use the ILP scheduler.
88 // For 32-bit, use the register pressure specific scheduling.
89 // For Atom, always use ILP scheduling.
90 if (Subtarget.isAtom())
91 setSchedulingPreference(Sched::ILP);
92 else if (Subtarget.is64Bit())
93 setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
96 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
97 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
99 // Bypass expensive divides on Atom when compiling with O2.
100 if (TM.getOptLevel() >= CodeGenOpt::Default) {
101 if (Subtarget.hasSlowDivide32())
102 addBypassSlowDiv(32, 8);
103 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 16);
  }
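  // Roughly, addBypassSlowDiv(32, 8) lets codegen guard a 32-bit divide with
  // a cheap runtime test of the form "if neither operand has bits above the
  // low 8, use the 8-bit divide instead", which is profitable on cores such
  // as Atom where DIV latency grows with operand width. This is a sketch of
  // the intent, not the exact IR that gets emitted.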
107 if (Subtarget.isTargetKnownWindowsMSVC()) {
108 // Setup Windows compiler runtime calls.
109 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
110 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
111 setLibcallName(RTLIB::SREM_I64, "_allrem");
112 setLibcallName(RTLIB::UREM_I64, "_aullrem");
113 setLibcallName(RTLIB::MUL_I64, "_allmul");
114 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
115 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
116 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
117 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }
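  // Sketch of the effect: on 32-bit MSVC targets an i64 division such as
  //   int64_t q = a / b;
  // has no single-instruction lowering, so it becomes a call to the CRT
  // helper registered above (_alldiv), and likewise for udiv/srem/urem/mul.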
121 if (Subtarget.isTargetDarwin()) {
122 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
123 setUseUnderscoreSetJmp(false);
124 setUseUnderscoreLongJmp(false);
125 } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp is plain longjmp!
127 setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
134 // Set up the register classes.
135 addRegisterClass(MVT::i8, &X86::GR8RegClass);
136 addRegisterClass(MVT::i16, &X86::GR16RegClass);
137 addRegisterClass(MVT::i32, &X86::GR32RegClass);
138 if (Subtarget.is64Bit())
139 addRegisterClass(MVT::i64, &X86::GR64RegClass);
141 for (MVT VT : MVT::integer_valuetypes())
142 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
144 // We don't accept any truncstore of integer registers.
145 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
146 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
147 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
148 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
149 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
150 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
152 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
154 // SETOEQ and SETUNE require checking two conditions.
155 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
156 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
157 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
158 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
159 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
160 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
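  // Rationale, informally: UCOMISS/UCOMISD report "unordered" through PF, so
  // an ordered-equal test needs ZF==1 *and* PF==0 (and SETUNE needs the
  // complement). No single x86 condition code checks both flags, hence these
  // two predicates are expanded into a pair of tests.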
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
164 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
165 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
166 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
168 if (Subtarget.is64Bit()) {
169 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
170 // f32/f64 are legal, f80 is custom.
171 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
    else
      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
174 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
175 } else if (!Subtarget.useSoftFloat()) {
176 // We have an algorithm for SSE2->double, and we turn this into a
177 // 64-bit FILD followed by conditional FADD for other targets.
178 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
179 // We have an algorithm for SSE2, and we turn this into a 64-bit
180 // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
  }
  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
186 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
187 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
189 if (!Subtarget.useSoftFloat()) {
190 // SSE has no i16 to fp conversion, only i32.
191 if (X86ScalarSSEf32) {
192 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
193 // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
  }
  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
206 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
207 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
209 if (!Subtarget.useSoftFloat()) {
210 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
211 // are Legal, f80 is custom lowered.
212 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
213 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
215 if (X86ScalarSSEf32) {
216 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
217 // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
      setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
  }
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
231 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
232 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
233 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
235 if (Subtarget.is64Bit()) {
236 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
237 // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
238 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
239 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
      setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
    }
244 } else if (!Subtarget.useSoftFloat()) {
245 // Since AVX is a superset of SSE3, only check for SSE here.
246 if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
247 // Expand FP_TO_UINT into a select.
248 // FIXME: We would like to use a Custom expander here eventually to do
249 // the optimal thing for SSE vs. the default expansion in the legalizer.
250 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
253 // With SSE3 we can use fisttpll to convert to a signed i64; without
254 // SSE, we're stuck with a fistpll.
255 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
  }
260 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
261 if (!X86ScalarSSEf64) {
262 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
263 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
264 if (Subtarget.is64Bit()) {
265 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
266 // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    }
269 } else if (!Subtarget.is64Bit())
270 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
272 // Scalar integer divide and remainder are lowered to use operations that
273 // produce two results, to match the available instructions. This exposes
274 // the two-result form to trivial CSE, which is able to combine x/y and x%y
275 // into a single instruction.
277 // Scalar integer multiply-high is also lowered to use two-result
278 // operations, to match the available instructions. However, plain multiply
279 // (low) operations are left as Legal, as there are single-result
280 // instructions for this in x86. Using the two-result multiply instructions
281 // when both high and low results are needed must be arranged by dagcombine.
282 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
283 setOperationAction(ISD::MULHS, VT, Expand);
284 setOperationAction(ISD::MULHU, VT, Expand);
285 setOperationAction(ISD::SDIV, VT, Expand);
286 setOperationAction(ISD::UDIV, VT, Expand);
287 setOperationAction(ISD::SREM, VT, Expand);
288 setOperationAction(ISD::UREM, VT, Expand);
290 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
291 setOperationAction(ISD::ADDC, VT, Custom);
292 setOperationAction(ISD::ADDE, VT, Custom);
293 setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }
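  // Example of the intent described above (a sketch, not the exact DAG):
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // both CSE into one two-result ISD::SDIVREM node, which selects to a single
  // IDIV leaving the quotient in EAX and the remainder in EDX.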
297 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
298 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
299 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
300 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
301 setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
304 if (Subtarget.is64Bit())
305 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
306 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
307 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
308 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
309 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
311 setOperationAction(ISD::FREM , MVT::f32 , Expand);
312 setOperationAction(ISD::FREM , MVT::f64 , Expand);
313 setOperationAction(ISD::FREM , MVT::f80 , Expand);
314 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
318 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
319 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
320 if (!Subtarget.hasBMI()) {
321 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
322 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
323 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
324 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
325 if (Subtarget.is64Bit()) {
326 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
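  // Background (informal): without BMI the lowering uses BSF, whose result is
  // undefined for a zero input, so plain CTTZ is custom-lowered to add a zero
  // check (typically a CMOV producing the operand width), while
  // CTTZ_ZERO_UNDEF can map straight onto BSF. With BMI, TZCNT is defined for
  // zero and everything can simply be Legal.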
331 if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
334 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
335 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
338 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
339 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
340 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
341 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
342 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
343 if (Subtarget.is64Bit()) {
344 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
349 // Special handling for half-precision floating point conversions.
350 // If we don't have F16C support, then lower half float conversions
351 // into library calls.
352 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
353 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }
357 // There's never any support for operations beyond MVT::f32.
358 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
359 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
360 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
361 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
363 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
364 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
365 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
366 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
367 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
368 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
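  // Note (informal): with F16C, f16<->f32 conversions select to
  // VCVTPH2PS/VCVTPS2PH; without it the Expand actions above become the usual
  // soft-float helper calls (by default __gnu_h2f_ieee / __gnu_f2h_ieee).
  // Conversions involving f64/f80 always go through f32 first, which is why
  // those are unconditionally Expand.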
370 if (Subtarget.hasPOPCNT()) {
371 setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
  } else {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
374 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
375 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
376 if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
  }
380 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
382 if (!Subtarget.hasMOVBE())
383 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
385 // These should be promoted to a larger select which is supported.
386 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
387 // X86 wants to expand cmov itself.
388 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
389 setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
392 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
395 setOperationAction(ISD::SELECT, VT, Custom);
396 setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SETCCE, VT, Custom);
  }
399 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
  // handling here; they are only a light-weight setjmp/longjmp replacement
  // used for continuations, user-level threading, and the like. As a result,
  // no other SjLj exception interfaces are implemented; please don't build
  // your own exception handling on top of them.
405 // LLVM/Clang supports zero-cost DWARF exception handling.
406 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
407 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
408 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
409 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
410 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
413 for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
416 setOperationAction(ISD::ConstantPool , VT, Custom);
417 setOperationAction(ISD::JumpTable , VT, Custom);
418 setOperationAction(ISD::GlobalAddress , VT, Custom);
419 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
420 setOperationAction(ISD::ExternalSymbol , VT, Custom);
    setOperationAction(ISD::BlockAddress  , VT, Custom);
  }
  // 64-bit shl, sra, srl (iff 32-bit x86)
424 for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
427 setOperationAction(ISD::SHL_PARTS, VT, Custom);
428 setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
432 if (Subtarget.hasSSE1())
433 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
435 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
437 // Expand certain atomics
438 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
439 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
440 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
441 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
442 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
443 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
444 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }
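  // Informal summary of the strategy: keeping these Custom lets the lowering
  // pick the cheapest LOCK-prefixed form, e.g. an atomicrmw add whose result
  // is unused can become "lock add", one whose result is used becomes
  // "lock xadd", and the remaining cases fall back to a CMPXCHG loop.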
448 if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }
452 // FIXME - use subtarget debug flags
453 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
454 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
455 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }
459 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
460 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
462 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
463 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
465 setOperationAction(ISD::TRAP, MVT::Other, Legal);
466 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
468 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
469 setOperationAction(ISD::VASTART , MVT::Other, Custom);
470 setOperationAction(ISD::VAEND , MVT::Other, Expand);
471 bool Is64Bit = Subtarget.is64Bit();
472 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
473 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
475 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
476 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
478 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
480 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
481 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
482 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
484 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
485 // f32 and f64 use SSE.
486 // Set up the FP register classes.
487 addRegisterClass(MVT::f32, &X86::FR32RegClass);
488 addRegisterClass(MVT::f64, &X86::FR64RegClass);
490 for (auto VT : { MVT::f32, MVT::f64 }) {
491 // Use ANDPD to simulate FABS.
492 setOperationAction(ISD::FABS, VT, Custom);
494 // Use XORP to simulate FNEG.
495 setOperationAction(ISD::FNEG, VT, Custom);
497 // Use ANDPD and ORPD to simulate FCOPYSIGN.
498 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
500 // We don't support sin/cos/fmod
501 setOperationAction(ISD::FSIN , VT, Expand);
502 setOperationAction(ISD::FCOS , VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
506 // Lower this to MOVMSK plus an AND.
507 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
508 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
512 addLegalFPImmediate(APFloat(+0.0)); // xorpd
513 addLegalFPImmediate(APFloat(+0.0f)); // xorps
514 } else if (UseX87 && X86ScalarSSEf32) {
515 // Use SSE for f32, x87 for f64.
516 // Set up the FP register classes.
517 addRegisterClass(MVT::f32, &X86::FR32RegClass);
518 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
520 // Use ANDPS to simulate FABS.
521 setOperationAction(ISD::FABS , MVT::f32, Custom);
523 // Use XORP to simulate FNEG.
524 setOperationAction(ISD::FNEG , MVT::f32, Custom);
526 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
528 // Use ANDPS and ORPS to simulate FCOPYSIGN.
529 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
530 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
532 // We don't support sin/cos/fmod
533 setOperationAction(ISD::FSIN , MVT::f32, Expand);
534 setOperationAction(ISD::FCOS , MVT::f32, Expand);
535 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
537 // Special cases we handle for FP constants.
538 addLegalFPImmediate(APFloat(+0.0f)); // xorps
539 addLegalFPImmediate(APFloat(+0.0)); // FLD0
540 addLegalFPImmediate(APFloat(+1.0)); // FLD1
541 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
542 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
544 if (!TM.Options.UnsafeFPMath) {
545 setOperationAction(ISD::FSIN , MVT::f64, Expand);
546 setOperationAction(ISD::FCOS , MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
551 // Set up the FP register classes.
552 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
553 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
555 for (auto VT : { MVT::f32, MVT::f64 }) {
556 setOperationAction(ISD::UNDEF, VT, Expand);
557 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
559 if (!TM.Options.UnsafeFPMath) {
560 setOperationAction(ISD::FSIN , VT, Expand);
561 setOperationAction(ISD::FCOS , VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
565 addLegalFPImmediate(APFloat(+0.0)); // FLD0
566 addLegalFPImmediate(APFloat(+1.0)); // FLD1
567 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
568 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
569 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
570 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
571 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
575 // We don't support FMA.
576 setOperationAction(ISD::FMA, MVT::f64, Expand);
577 setOperationAction(ISD::FMA, MVT::f32, Expand);
  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
582 addRegisterClass(MVT::f128, &X86::FR128RegClass);
583 ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
584 setOperationAction(ISD::FABS , MVT::f128, Custom);
585 setOperationAction(ISD::FNEG , MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }
589 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
590 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
591 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
593 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
    addLegalFPImmediate(TmpFlt);  // FLD0
    TmpFlt.changeSign();
    addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

    bool ignored;
    APFloat TmpFlt2(+1.0);
    TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                    &ignored);
602 addLegalFPImmediate(TmpFlt2); // FLD1
603 TmpFlt2.changeSign();
604 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
607 if (!TM.Options.UnsafeFPMath) {
608 setOperationAction(ISD::FSIN , MVT::f80, Expand);
609 setOperationAction(ISD::FCOS , MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }
613 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
614 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
615 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
616 setOperationAction(ISD::FRINT, MVT::f80, Expand);
617 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
621 // Always use a library call for pow.
622 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
623 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
624 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
626 setOperationAction(ISD::FLOG, MVT::f80, Expand);
627 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
628 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
629 setOperationAction(ISD::FEXP, MVT::f80, Expand);
630 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
631 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
632 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
634 // Some FP actions are always expanded for vector types.
635 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
636 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
637 setOperationAction(ISD::FSIN, VT, Expand);
638 setOperationAction(ISD::FSINCOS, VT, Expand);
639 setOperationAction(ISD::FCOS, VT, Expand);
640 setOperationAction(ISD::FREM, VT, Expand);
641 setOperationAction(ISD::FPOWI, VT, Expand);
642 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
643 setOperationAction(ISD::FPOW, VT, Expand);
644 setOperationAction(ISD::FLOG, VT, Expand);
645 setOperationAction(ISD::FLOG2, VT, Expand);
646 setOperationAction(ISD::FLOG10, VT, Expand);
647 setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }
651 // First set operation action for all vector types to either promote
652 // (for widening) or expand (for scalarization). Then we will selectively
653 // turn on ones that can be effectively codegen'd.
654 for (MVT VT : MVT::vector_valuetypes()) {
655 setOperationAction(ISD::SDIV, VT, Expand);
656 setOperationAction(ISD::UDIV, VT, Expand);
657 setOperationAction(ISD::SREM, VT, Expand);
658 setOperationAction(ISD::UREM, VT, Expand);
659 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
660 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
661 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
662 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
663 setOperationAction(ISD::FMA, VT, Expand);
664 setOperationAction(ISD::FFLOOR, VT, Expand);
665 setOperationAction(ISD::FCEIL, VT, Expand);
666 setOperationAction(ISD::FTRUNC, VT, Expand);
667 setOperationAction(ISD::FRINT, VT, Expand);
668 setOperationAction(ISD::FNEARBYINT, VT, Expand);
669 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
670 setOperationAction(ISD::MULHS, VT, Expand);
671 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
672 setOperationAction(ISD::MULHU, VT, Expand);
673 setOperationAction(ISD::SDIVREM, VT, Expand);
674 setOperationAction(ISD::UDIVREM, VT, Expand);
675 setOperationAction(ISD::CTPOP, VT, Expand);
676 setOperationAction(ISD::CTTZ, VT, Expand);
677 setOperationAction(ISD::CTLZ, VT, Expand);
678 setOperationAction(ISD::ROTL, VT, Expand);
679 setOperationAction(ISD::ROTR, VT, Expand);
680 setOperationAction(ISD::BSWAP, VT, Expand);
681 setOperationAction(ISD::SETCC, VT, Expand);
682 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
683 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
684 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
685 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
686 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
687 setOperationAction(ISD::TRUNCATE, VT, Expand);
688 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
689 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
690 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
691 setOperationAction(ISD::SELECT_CC, VT, Expand);
692 for (MVT InnerVT : MVT::vector_valuetypes()) {
693 setTruncStoreAction(InnerVT, VT, Expand);
695 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
696 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
698 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
699 // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // it alone for now.
702 if (VT.getVectorElementType() == MVT::i1)
703 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
705 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
706 // split/scalarized right now.
707 if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
712 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
713 // with -msoft-float, disable use of MMX as well.
714 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
715 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }
719 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
720 addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
722 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
723 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
724 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
725 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
726 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
727 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
728 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
732 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
733 addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
735 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
736 // registers cannot be used even for integer operations.
737 addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
738 addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
739 addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
740 addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
742 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
743 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
744 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
745 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
746 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
747 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
748 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
749 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
750 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
751 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
752 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
753 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
755 setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
756 setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
757 setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
758 setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
760 setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
761 setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
762 setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
763 setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
765 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
766 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
767 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
768 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
769 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
771 setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
772 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
773 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
774 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
776 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
777 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
778 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
779 // ISD::CTTZ v2i64 - scalarization is faster.
781 // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
782 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
783 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
784 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
785 setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
789 // We support custom legalizing of sext and anyext loads for specific
790 // memory vector types which we can load as a scalar (or sequence of
791 // scalars) and extend in-register to a legal 128-bit vector type. For sext
792 // loads these must work with a single scalar load.
793 for (MVT VT : MVT::integer_vector_valuetypes()) {
794 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
795 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
796 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
797 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
798 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
799 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
800 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
801 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }
805 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
806 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
807 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
808 setOperationAction(ISD::VSELECT, VT, Custom);
      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;
813 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
817 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
818 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
819 setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
820 setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
821 setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
822 setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
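    // Example of the effect (informal): an IR-level "and <16 x i8>" is
    // bitcast to v2i64, ANDed there, and bitcast back. Since PAND operates on
    // the full 128-bit register regardless of element type, funnelling all
    // bitwise ops through one canonical type keeps the patterns smaller.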
826 // Custom lower v2i64 and v2f64 selects.
827 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
828 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
830 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
831 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
833 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
835 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
836 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
839 if (!Subtarget.is64Bit())
840 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
842 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
843 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
845 for (MVT VT : MVT::fp_vector_valuetypes())
846 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
848 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
849 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
850 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
852 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
853 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
854 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
856 for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
857 setOperationAction(ISD::SRL, VT, Custom);
858 setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
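    // Why Custom (informal): SSE2 only shifts a whole vector by one scalar
    // amount (e.g. PSLLW for v8i16) and has no byte-element shifts at all, so
    // per-element shift amounts for v16i8/v8i16 are emulated in the custom
    // lowering, e.g. by shifting in wider elements and masking, or by
    // selecting between pre-shifted values.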
    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
864 for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
865 setOperationAction(ISD::SRL, VT, Custom);
866 setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }
871 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
872 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
873 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
874 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
875 // ISD::CTLZ v4i32 - scalarization is faster.
    // ISD::CTLZ v2i64 - scalarization is faster.
  }
879 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
880 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
881 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
882 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
883 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
884 setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }
888 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
889 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
890 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
891 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
892 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
893 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
894 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
895 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
897 // FIXME: Do we need to handle scalar-to-vector here?
898 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
902 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
904 // SSE41 brings specific instructions for doing vector sign extend even in
905 // cases where we don't have SRA.
906 for (MVT VT : MVT::integer_vector_valuetypes()) {
907 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
908 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }
912 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
913 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
914 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
915 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
916 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
917 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
918 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
920 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
921 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
922 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
923 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
924 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
925 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
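    // For illustration: with SSE4.1 a pattern roughly like
    //   %w = load <4 x i8>, <4 x i8>* %p
    //   %x = sext <4 x i8> %w to <4 x i32>
    // can be selected as a single PMOVSXBD with a memory operand, which is
    // why the sextload/zextload combinations above are marked Legal.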
    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }
932 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
933 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
934 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
935 setOperationAction(ISD::ROTL, VT, Custom);
937 // XOP can efficiently perform BITREVERSE with VPPERM.
938 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
939 setOperationAction(ISD::BITREVERSE, VT, Custom);
941 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
942 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
946 if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
947 bool HasInt256 = Subtarget.hasInt256();
949 addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
950 addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
951 addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
952 addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
953 addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
954 addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
956 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
957 setOperationAction(ISD::FFLOOR, VT, Legal);
958 setOperationAction(ISD::FCEIL, VT, Legal);
959 setOperationAction(ISD::FTRUNC, VT, Legal);
960 setOperationAction(ISD::FRINT, VT, Legal);
961 setOperationAction(ISD::FNEARBYINT, VT, Legal);
962 setOperationAction(ISD::FNEG, VT, Custom);
963 setOperationAction(ISD::FABS, VT, Custom);
966 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
967 // even though v8i16 is a legal type.
968 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
969 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
970 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
972 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
973 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
974 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
976 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
977 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
979 for (MVT VT : MVT::fp_vector_valuetypes())
980 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
982 for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
983 setOperationAction(ISD::SRL, VT, Custom);
984 setOperationAction(ISD::SHL, VT, Custom);
985 setOperationAction(ISD::SRA, VT, Custom);
988 setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
989 setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
990 setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
991 setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
993 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
994 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
995 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
997 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
998 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
999 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1000 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
1001 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
1002 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
1003 setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
1004 setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
1005 setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
1006 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1007 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1008 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1009 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1011 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1012 setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }
1016 // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
1017 // as we end up splitting the 256-bit vectors.
1018 for (auto VT : { MVT::v32i8, MVT::v16i16 })
1019 setOperationAction(ISD::CTLZ, VT, Custom);
    if (HasInt256)
      for (auto VT : { MVT::v8i32, MVT::v4i64 })
1023 setOperationAction(ISD::CTLZ, VT, Custom);
1025 if (Subtarget.hasAnyFMA()) {
1026 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1027 MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }
1031 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1032 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }
1036 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1037 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1038 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1039 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1041 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1042 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1044 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1045 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1046 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1047 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1049 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1050 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1051 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1052 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    if (HasInt256) {
1057 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
1058 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
1059 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1061 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1062 // when we have a 256bit-wide blend with immediate.
1063 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1065 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1066 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1067 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1068 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1069 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1070 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1071 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1073 setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1074 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1075 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1076 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1077 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
    }
    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
1083 for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
1084 setOperationAction(ISD::SRL, VT, Custom);
1085 setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
1089 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1090 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1091 setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }
1095 // Extract subvector is special because the value type
1096 // (result) is 128-bit but the source is 256-bit wide.
1097 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1098 MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }
1102 // Custom lower several nodes for 256-bit types.
1103 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1104 MVT::v8f32, MVT::v4f64 }) {
1105 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1106 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1107 setOperationAction(ISD::VSELECT, VT, Custom);
1108 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1109 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1110 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1111 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
1116 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1118 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1119 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1120 setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
1121 setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
1122 setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
1123 setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }
1128 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1129 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1130 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1131 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1132 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1134 addRegisterClass(MVT::i1, &X86::VK1RegClass);
1135 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1136 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1138 for (MVT VT : MVT::fp_vector_valuetypes())
1139 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1141 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1142 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1143 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1144 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1145 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1146 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1147 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1149 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
1150 setOperationAction(ISD::SETCC, MVT::i1, Custom);
1151 setOperationAction(ISD::SETCCE, MVT::i1, Custom);
1152 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
1153 setOperationAction(ISD::XOR, MVT::i1, Legal);
1154 setOperationAction(ISD::OR, MVT::i1, Legal);
1155 setOperationAction(ISD::AND, MVT::i1, Legal);
1156 setOperationAction(ISD::SUB, MVT::i1, Custom);
1157 setOperationAction(ISD::ADD, MVT::i1, Custom);
1158 setOperationAction(ISD::MUL, MVT::i1, Custom);
1160 for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1161 MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1162 MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1163 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1164 setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1165 setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1166 setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }
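    // Context (informal): with AVX-512, vXi1 mask vectors live in the k0-k7
    // mask registers, which have no extending-load or truncating-store
    // encodings of their own, so those cases are custom-lowered (typically by
    // moving the mask through a GPR with KMOV and using ordinary integer
    // loads/stores).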
1170 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1171 setOperationAction(ISD::FNEG, VT, Custom);
1172 setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
    }
1176 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1177 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1178 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1179 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1180 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1181 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1182 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1183 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
1184 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
1185 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1186 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1187 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1188 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
1189 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
1190 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
1191 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1193 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1194 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1195 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1196 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1197 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1198 if (Subtarget.hasVLX()){
1199 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1200 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1201 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1202 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1203 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1205 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1206 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1207 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1208 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    } else {
1211 setOperationAction(ISD::MLOAD, MVT::v8i32, Custom);
1212 setOperationAction(ISD::MLOAD, MVT::v8f32, Custom);
1213 setOperationAction(ISD::MSTORE, MVT::v8i32, Custom);
      setOperationAction(ISD::MSTORE, MVT::v8f32, Custom);
    }
1216 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
1217 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1218 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1219 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
1220 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
1221 setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
1222 setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
1223 if (Subtarget.hasDQI()) {
1224 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1225 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1226 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1227 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1228 if (Subtarget.hasVLX()) {
1229 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
1230 setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
1231 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
1232 setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
1233 setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
1234 setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
1235 setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
      }
    }
1239 if (Subtarget.hasVLX()) {
1240 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1241 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1242 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1243 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1244 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1245 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1246 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1247 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1248 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
1249 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
      // FIXME: These extending loads are also available on SSE/AVX2; add the
      // relevant patterns.
1252 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1253 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1254 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
1255 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1256 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1257 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1258 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1259 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
1260 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }
1264 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
1265 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
1266 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1267 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1268 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1269 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1270 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1271 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1272 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1273 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1274 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1275 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1276 if (Subtarget.hasDQI()) {
1277 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
    }
1280 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1281 setOperationAction(ISD::FFLOOR, VT, Legal);
1282 setOperationAction(ISD::FCEIL, VT, Legal);
1283 setOperationAction(ISD::FTRUNC, VT, Legal);
1284 setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }
1288 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1289 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1290 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1291 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1292 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1294 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1295 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1297 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1299 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1300 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1301 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1302 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1303 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1304 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1305 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1306 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1307 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1308 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1309 setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
1310 setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
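// AVX-512F provides packed signed/unsigned min/max for 32- and 64-bit
// elements, so these are legal on the 512-bit types.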
1312 setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
1313 setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
1314 setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
1315 setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
1316 setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
1317 setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
1318 setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
1319 setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
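// Integer arithmetic on i1 mask vectors is expanded rather than handled
// natively (add/sub/mul degenerate to xor/xor/and anyway).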
1321 setOperationAction(ISD::ADD, MVT::v8i1, Expand);
1322 setOperationAction(ISD::ADD, MVT::v16i1, Expand);
1323 setOperationAction(ISD::SUB, MVT::v8i1, Expand);
1324 setOperationAction(ISD::SUB, MVT::v16i1, Expand);
1325 setOperationAction(ISD::MUL, MVT::v8i1, Expand);
1326 setOperationAction(ISD::MUL, MVT::v16i1, Expand);
1328 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
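// Bitwise logic is legal on the 512-bit integer types; vector shifts, CTPOP
// and CTTZ need custom lowering.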
1330 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1331 setOperationAction(ISD::SRL, VT, Custom);
1332 setOperationAction(ISD::SHL, VT, Custom);
1333 setOperationAction(ISD::SRA, VT, Custom);
1334 setOperationAction(ISD::AND, VT, Legal);
1335 setOperationAction(ISD::OR, VT, Legal);
1336 setOperationAction(ISD::XOR, VT, Legal);
1337 setOperationAction(ISD::CTPOP, VT, Custom);
1338 setOperationAction(ISD::CTTZ, VT, Custom);
1341 if (Subtarget.hasCDI()) {
1342 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1343 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1345 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1346 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1347 setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
1348 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
1350 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
1351 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
1353 if (Subtarget.hasVLX()) {
1354 setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
1355 setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
1356 setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
1357 setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
1359 setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
1360 setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
1361 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1362 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1365 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
1366 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
1367 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
1368 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
1369 } // Subtarget.hasCDI()
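// AVX-512 DQ adds a native 64-bit element multiply (vpmullq); without it,
// v8i64 MUL stays Custom (set above).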
1371 if (Subtarget.hasDQI()) {
1372 if (Subtarget.hasVLX()) {
1373 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1374 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1376 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1378 // Custom lower several nodes.
1379 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1380 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1381 setOperationAction(ISD::MGATHER, VT, Custom);
1382 setOperationAction(ISD::MSCATTER, VT, Custom);
1384 // Extract subvector is special because the value type
1385 // (result) is 256-bit but the source is 512-bit wide.
1386 // 128-bit was made Custom under AVX1.
1387 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1388 MVT::v8f32, MVT::v4f64 })
1389 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1390 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1391 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1392 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1394 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1395 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1396 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1397 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1398 setOperationAction(ISD::VSELECT, VT, Legal);
1399 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1400 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1401 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1402 setOperationAction(ISD::MLOAD, VT, Legal);
1403 setOperationAction(ISD::MSTORE, VT, Legal);
1404 setOperationAction(ISD::MGATHER, VT, Legal);
1405 setOperationAction(ISD::MSCATTER, VT, Custom);
1407 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1408 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
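// AVX-512 BW adds 512-bit byte/word vector support and the 32- and 64-bit
// mask register classes.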
1412 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1413 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1414 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1416 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1417 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1419 setOperationAction(ISD::ADD, MVT::v32i1, Expand);
1420 setOperationAction(ISD::ADD, MVT::v64i1, Expand);
1421 setOperationAction(ISD::SUB, MVT::v32i1, Expand);
1422 setOperationAction(ISD::SUB, MVT::v64i1, Expand);
1423 setOperationAction(ISD::MUL, MVT::v32i1, Expand);
1424 setOperationAction(ISD::MUL, MVT::v64i1, Expand);
1426 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1427 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1428 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1429 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1430 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1431 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1432 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1433 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1434 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1435 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1436 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1437 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1438 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
1439 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
1440 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1441 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1442 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1443 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1444 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1445 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1446 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1447 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1448 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1449 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1450 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1451 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1452 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1453 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1454 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1455 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1456 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1457 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1458 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1459 setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
1460 setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
1461 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1462 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1463 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1464 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1465 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1466 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1467 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1468 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1469 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1470 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1472 setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
1473 setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
1474 setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
1475 setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
1476 setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
1477 setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
1478 setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
1479 setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
1481 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1482 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1483 if (Subtarget.hasVLX())
1484 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
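// Masked loads/stores of byte/word vectors narrower than 512 bits are only
// directly available with VLX; otherwise they are custom-lowered (e.g. by
// widening to 512 bits).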
1486 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1487 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1488 setOperationAction(ISD::MLOAD, VT, Action);
1489 setOperationAction(ISD::MSTORE, VT, Action);
1492 if (Subtarget.hasCDI()) {
1493 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1494 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1497 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1498 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1499 setOperationAction(ISD::VSELECT, VT, Legal);
1500 setOperationAction(ISD::SRL, VT, Custom);
1501 setOperationAction(ISD::SHL, VT, Custom);
1502 setOperationAction(ISD::SRA, VT, Custom);
1503 setOperationAction(ISD::MLOAD, VT, Legal);
1504 setOperationAction(ISD::MSTORE, VT, Legal);
1505 setOperationAction(ISD::CTPOP, VT, Custom);
1506 setOperationAction(ISD::CTTZ, VT, Custom);
1508 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1509 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1510 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
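// Extending loads (sext/zext/any) from byte vectors into word vectors are
// legal with BWI; the narrower forms additionally need VLX.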
1513 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1514 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1515 if (Subtarget.hasVLX()) {
1516 // FIXME: These instructions are available on SSE/AVX2; add the relevant patterns.
1517 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1518 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
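// AVX-512 VL: add the v2i1/v4i1 mask register classes and set up the
// corresponding mask operations, mirroring the wider mask types above.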
1523 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1524 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1525 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1527 setOperationAction(ISD::ADD, MVT::v2i1, Expand);
1528 setOperationAction(ISD::ADD, MVT::v4i1, Expand);
1529 setOperationAction(ISD::SUB, MVT::v2i1, Expand);
1530 setOperationAction(ISD::SUB, MVT::v4i1, Expand);
1531 setOperationAction(ISD::MUL, MVT::v2i1, Expand);
1532 setOperationAction(ISD::MUL, MVT::v4i1, Expand);
1534 setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
1535 setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
1536 setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
1537 setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
1538 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1539 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1540 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1541 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1542 setOperationAction(ISD::SELECT, MVT::v4i1, Custom);
1543 setOperationAction(ISD::SELECT, MVT::v2i1, Custom);
1544 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
1545 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom);
1546 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom);
1547 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom);
1548 setOperationAction(ISD::VSELECT, MVT::v2i1, Expand);
1549 setOperationAction(ISD::VSELECT, MVT::v4i1, Expand);
1551 for (auto VT : { MVT::v4i32, MVT::v8i32 }) {
1552 setOperationAction(ISD::AND, VT, Legal);
1553 setOperationAction(ISD::OR, VT, Legal);
1554 setOperationAction(ISD::XOR, VT, Legal);
1557 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1558 setOperationAction(ISD::SMAX, VT, Legal);
1559 setOperationAction(ISD::UMAX, VT, Legal);
1560 setOperationAction(ISD::SMIN, VT, Legal);
1561 setOperationAction(ISD::UMIN, VT, Legal);
1565 // We want to custom lower some of our intrinsics.
1566 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1567 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1568 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1569 if (!Subtarget.is64Bit()) {
1570 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1571 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1574 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1575 // handle type legalization for these operations here.
1577 // FIXME: We really should do custom legalization for addition and
1578 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1579 // than generic legalization for 64-bit multiplication-with-overflow, though.
1580 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1581 if (VT == MVT::i64 && !Subtarget.is64Bit())
1582 continue;
1583 // Add/Sub/Mul with overflow operations are custom lowered.
1584 setOperationAction(ISD::SADDO, VT, Custom);
1585 setOperationAction(ISD::UADDO, VT, Custom);
1586 setOperationAction(ISD::SSUBO, VT, Custom);
1587 setOperationAction(ISD::USUBO, VT, Custom);
1588 setOperationAction(ISD::SMULO, VT, Custom);
1589 setOperationAction(ISD::UMULO, VT, Custom);
1592 if (!Subtarget.is64Bit()) {
1593 // These libcalls are not available on 32-bit targets.
1594 setLibcallName(RTLIB::SHL_I128, nullptr);
1595 setLibcallName(RTLIB::SRL_I128, nullptr);
1596 setLibcallName(RTLIB::SRA_I128, nullptr);
1599 // Combine sin / cos into one node or libcall if possible.
1600 if (Subtarget.hasSinCos()) {
1601 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1602 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1603 if (Subtarget.isTargetDarwin()) {
1604 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1605 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1606 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1607 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
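// The Win64 ABI passes 128-bit integers by reference, so the generic i128
// division/remainder libcall lowering does not apply; custom-lower these to
// calls that follow that convention.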
1611 if (Subtarget.isTargetWin64()) {
1612 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1613 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1614 setOperationAction(ISD::SREM, MVT::i128, Custom);
1615 setOperationAction(ISD::UREM, MVT::i128, Custom);
1616 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1617 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1620 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1621 // is. We should promote the value to 64 bits to solve this.
1622 // This is what the CRT headers do - `fmodf` is an inline header
1623 // function casting to f64 and calling `fmod`.
1624 if (Subtarget.is32Bit() && Subtarget.isTargetKnownWindowsMSVC())
1625 for (ISD::NodeType Op :
1626 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1627 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1628 if (isOperationExpand(Op, MVT::f32))
1629 setOperationAction(Op, MVT::f32, Promote);
1631 // We have target-specific dag combine patterns for the following nodes:
1632 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1633 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1634 setTargetDAGCombine(ISD::BITCAST);
1635 setTargetDAGCombine(ISD::VSELECT);
1636 setTargetDAGCombine(ISD::SELECT);
1637 setTargetDAGCombine(ISD::SHL);
1638 setTargetDAGCombine(ISD::SRA);
1639 setTargetDAGCombine(ISD::SRL);
1640 setTargetDAGCombine(ISD::OR);
1641 setTargetDAGCombine(ISD::AND);
1642 setTargetDAGCombine(ISD::ADD);
1643 setTargetDAGCombine(ISD::FADD);
1644 setTargetDAGCombine(ISD::FSUB);
1645 setTargetDAGCombine(ISD::FNEG);
1646 setTargetDAGCombine(ISD::FMA);
1647 setTargetDAGCombine(ISD::FMINNUM);
1648 setTargetDAGCombine(ISD::FMAXNUM);
1649 setTargetDAGCombine(ISD::SUB);
1650 setTargetDAGCombine(ISD::LOAD);
1651 setTargetDAGCombine(ISD::MLOAD);
1652 setTargetDAGCombine(ISD::STORE);
1653 setTargetDAGCombine(ISD::MSTORE);
1654 setTargetDAGCombine(ISD::TRUNCATE);
1655 setTargetDAGCombine(ISD::ZERO_EXTEND);
1656 setTargetDAGCombine(ISD::ANY_EXTEND);
1657 setTargetDAGCombine(ISD::SIGN_EXTEND);
1658 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1659 setTargetDAGCombine(ISD::SINT_TO_FP);
1660 setTargetDAGCombine(ISD::UINT_TO_FP);
1661 setTargetDAGCombine(ISD::SETCC);
1662 setTargetDAGCombine(ISD::MUL);
1663 setTargetDAGCombine(ISD::XOR);
1664 setTargetDAGCombine(ISD::MSCATTER);
1665 setTargetDAGCombine(ISD::MGATHER);
1667 computeRegisterProperties(Subtarget.getRegisterInfo());
1669 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1670 MaxStoresPerMemsetOptSize = 8;
1671 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1672 MaxStoresPerMemcpyOptSize = 4;
1673 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1674 MaxStoresPerMemmoveOptSize = 4;
1675 setPrefLoopAlignment(4); // 2^4 bytes.
1677 // An out-of-order CPU can speculatively execute past a predictable branch,
1678 // but a conditional move could be stalled by an expensive earlier operation.
1679 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1680 EnableExtLdPromotion = true;
1681 setPrefFunctionAlignment(4); // 2^4 bytes.
1683 verifyIntrinsicTables();
1686 // This has so far only been implemented for 64-bit MachO.
1687 bool X86TargetLowering::useLoadStackGuardNode() const {
1688 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1691 TargetLoweringBase::LegalizeTypeAction
1692 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1693 if (ExperimentalVectorWideningLegalization &&
1694 VT.getVectorNumElements() != 1 &&
1695 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1696 return TypeWidenVector;
1698 return TargetLoweringBase::getPreferredVectorAction(VT);
1701 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1702 LLVMContext& Context,
1705 return Subtarget.hasAVX512() ? MVT::i1 : MVT::i8;
1707 if (VT.isSimple()) {
1708 MVT VVT = VT.getSimpleVT();
1709 const unsigned NumElts = VVT.getVectorNumElements();
1710 MVT EltVT = VVT.getVectorElementType();
1711 if (VVT.is512BitVector()) {
1712 if (Subtarget.hasAVX512())
1713 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1714 EltVT == MVT::f32 || EltVT == MVT::f64)
1716 case 8: return MVT::v8i1;
1717 case 16: return MVT::v16i1;
1719 if (Subtarget.hasBWI())
1720 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1722 case 32: return MVT::v32i1;
1723 case 64: return MVT::v64i1;
1727 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1728 return MVT::getVectorVT(MVT::i1, NumElts);
1730 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1731 EVT LegalVT = getTypeToTransformTo(Context, VT);
1732 EltVT = LegalVT.getVectorElementType().getSimpleVT();
1735 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1737 case 2: return MVT::v2i1;
1738 case 4: return MVT::v4i1;
1739 case 8: return MVT::v8i1;
1743 return VT.changeVectorElementTypeToInteger();
1746 /// Helper for getByValTypeAlignment to determine
1747 /// the desired ByVal argument alignment.
1748 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1751 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1752 if (VTy->getBitWidth() == 128)
1754 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1755 unsigned EltAlign = 0;
1756 getMaxByValAlign(ATy->getElementType(), EltAlign);
1757 if (EltAlign > MaxAlign)
1758 MaxAlign = EltAlign;
1759 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1760 for (auto *EltTy : STy->elements()) {
1761 unsigned EltAlign = 0;
1762 getMaxByValAlign(EltTy, EltAlign);
1763 if (EltAlign > MaxAlign)
1764 MaxAlign = EltAlign;
1771 /// Return the desired alignment for ByVal aggregate
1772 /// function arguments in the caller parameter area. For X86, aggregates
1773 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1774 /// are at 4-byte boundaries.
1775 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1776 const DataLayout &DL) const {
1777 if (Subtarget.is64Bit()) {
1778 // Max of 8 and alignment of type.
1779 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1786 if (Subtarget.hasSSE1())
1787 getMaxByValAlign(Ty, Align);
1791 /// Returns the target specific optimal type for load
1792 /// and store operations as a result of memset, memcpy, and memmove
1793 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
1794 /// constraint. Similarly, if SrcAlign is zero, there is no need to check it
1795 /// against the alignment requirement,
1796 /// probably because the source does not need to be loaded. If 'IsMemset' is
1797 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1798 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1799 /// source is constant so it does not need to be loaded.
1800 /// It returns EVT::Other if the type should be determined using generic
1801 /// target-independent logic.
1803 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1804 unsigned DstAlign, unsigned SrcAlign,
1805 bool IsMemset, bool ZeroMemset,
1807 MachineFunction &MF) const {
1808 const Function *F = MF.getFunction();
1809 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1811 (!Subtarget.isUnalignedMem16Slow() ||
1812 ((DstAlign == 0 || DstAlign >= 16) &&
1813 (SrcAlign == 0 || SrcAlign >= 16)))) {
1814 // FIXME: Check if unaligned 32-byte accesses are slow.
1815 if (Size >= 32 && Subtarget.hasAVX()) {
1816 // Although this isn't a well-supported type for AVX1, we'll let
1817 // legalization and shuffle lowering produce the optimal codegen. If we
1818 // choose an optimal type with a vector element larger than a byte,
1819 // getMemsetStores() may create an intermediate splat (using an integer
1820 // multiply) before we splat as a vector.
1823 if (Subtarget.hasSSE2())
1825 // TODO: Can SSE1 handle a byte vector?
1826 if (Subtarget.hasSSE1())
1828 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1829 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1830 // Do not use f64 to lower memcpy if source is string constant. It's
1831 // better to use i32 to avoid the loads.
1832 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1833 // The gymnastics of splatting a byte value into an XMM register and then
1834 // only using 8-byte stores (because this is a CPU with slow unaligned
1835 // 16-byte accesses) makes that a loser.
1839 // This is a compromise. If we reach here, unaligned accesses may be slow on
1840 // this target. However, creating smaller, aligned accesses could be even
1841 // slower and would certainly be a lot more code.
1842 if (Subtarget.is64Bit() && Size >= 8)
1847 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1849 return X86ScalarSSEf32;
1850 else if (VT == MVT::f64)
1851 return X86ScalarSSEf64;
1856 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1861 switch (VT.getSizeInBits()) {
1863 // 8-byte and under are always assumed to be fast.
1867 *Fast = !Subtarget.isUnalignedMem16Slow();
1870 *Fast = !Subtarget.isUnalignedMem32Slow();
1872 // TODO: What about AVX-512 (512-bit) accesses?
1875 // Misaligned accesses of any size are always allowed.
1879 /// Return the entry encoding for a jump table in the
1880 /// current function. The returned value is a member of the
1881 /// MachineJumpTableInfo::JTEntryKind enum.
1882 unsigned X86TargetLowering::getJumpTableEncoding() const {
1883 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF symbol.
1885 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1886 return MachineJumpTableInfo::EK_Custom32;
1888 // Otherwise, use the normal jump table encoding heuristics.
1889 return TargetLowering::getJumpTableEncoding();
1892 bool X86TargetLowering::useSoftFloat() const {
1893 return Subtarget.useSoftFloat();
1897 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1898 const MachineBasicBlock *MBB,
1899 unsigned uid, MCContext &Ctx) const {
1900 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1901 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF entries.
1903 return MCSymbolRefExpr::create(MBB->getSymbol(),
1904 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1907 /// Returns relocation base for the given PIC jumptable.
1908 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1909 SelectionDAG &DAG) const {
1910 if (!Subtarget.is64Bit())
1911 // This doesn't have SDLoc associated with it, but is not really the
1912 // same as a Register.
1913 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1914 getPointerTy(DAG.getDataLayout()));
1918 /// This returns the relocation base for the given PIC jumptable,
1919 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1920 const MCExpr *X86TargetLowering::
1921 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1922 MCContext &Ctx) const {
1923 // X86-64 uses RIP relative addressing based on the jump table label.
1924 if (Subtarget.isPICStyleRIPRel())
1925 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1927 // Otherwise, the reference is relative to the PIC base.
1928 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1931 std::pair<const TargetRegisterClass *, uint8_t>
1932 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1934 const TargetRegisterClass *RRC = nullptr;
1936 switch (VT.SimpleTy) {
1938 return TargetLowering::findRepresentativeClass(TRI, VT);
1939 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1940 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1943 RRC = &X86::VR64RegClass;
1945 case MVT::f32: case MVT::f64:
1946 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1947 case MVT::v4f32: case MVT::v2f64:
1948 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1950 RRC = &X86::VR128RegClass;
1953 return std::make_pair(RRC, Cost);
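// Address spaces 256 and 257 correspond to the %gs and %fs segments; the
// stack-guard and SafeStack slots below are addressed through them.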
1956 unsigned X86TargetLowering::getAddressSpace() const {
1957 if (Subtarget.is64Bit())
1958 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
1962 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
1963 // glibc has a special slot for the stack guard in tcbhead_t, use it instead
1964 // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
1965 if (!Subtarget.isTargetGlibc())
1966 return TargetLowering::getIRStackGuard(IRB);
1968 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28; on i386 it is %gs:0x14.
1970 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
1971 unsigned AddressSpace = getAddressSpace();
1972 return ConstantExpr::getIntToPtr(
1973 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
1974 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
1977 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
1978 // The MSVC CRT provides functionality for stack protection.
1979 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
1980 // MSVC CRT has a global variable holding security cookie.
1981 M.getOrInsertGlobal("__security_cookie",
1982 Type::getInt8PtrTy(M.getContext()));
1984 // MSVC CRT has a function to validate security cookie.
1985 auto *SecurityCheckCookie = cast<Function>(
1986 M.getOrInsertFunction("__security_check_cookie",
1987 Type::getVoidTy(M.getContext()),
1988 Type::getInt8PtrTy(M.getContext()), nullptr));
1989 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
1990 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
1993 // glibc has a special slot for the stack guard.
1994 if (Subtarget.isTargetGlibc())
1996 TargetLowering::insertSSPDeclarations(M);
1999 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2000 // MSVC CRT has a global variable holding security cookie.
2001 if (Subtarget.getTargetTriple().isOSMSVCRT())
2002 return M.getGlobalVariable("__security_cookie");
2003 return TargetLowering::getSDagStackGuard(M);
2006 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2007 // MSVC CRT has a function to validate security cookie.
2008 if (Subtarget.getTargetTriple().isOSMSVCRT())
2009 return M.getFunction("__security_check_cookie");
2010 return TargetLowering::getSSPStackGuardCheck(M);
2013 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2014 if (!Subtarget.isTargetAndroid())
2015 return TargetLowering::getSafeStackPointerLocation(IRB);
2017 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2018 // definition of TLS_SLOT_SAFESTACK in
2019 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2020 unsigned AddressSpace, Offset;
2022 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:0x48; on i386 it is %gs:0x24.
2024 Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2025 AddressSpace = getAddressSpace();
2026 return ConstantExpr::getIntToPtr(
2027 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2028 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2031 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2032 unsigned DestAS) const {
2033 assert(SrcAS != DestAS && "Expected different address spaces!");
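// Casts between the ordinary address spaces are no-ops; anything involving
// the segmented address spaces (>= 256) changes how the pointer is addressed.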
2035 return SrcAS < 256 && DestAS < 256;
2038 //===----------------------------------------------------------------------===//
2039 // Return Value Calling Convention Implementation
2040 //===----------------------------------------------------------------------===//
2042 #include "X86GenCallingConv.inc"
2044 bool X86TargetLowering::CanLowerReturn(
2045 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2046 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2047 SmallVector<CCValAssign, 16> RVLocs;
2048 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2049 return CCInfo.CheckReturn(Outs, RetCC_X86);
2052 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2053 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2058 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2060 const SmallVectorImpl<ISD::OutputArg> &Outs,
2061 const SmallVectorImpl<SDValue> &OutVals,
2062 const SDLoc &dl, SelectionDAG &DAG) const {
2063 MachineFunction &MF = DAG.getMachineFunction();
2064 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2066 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2067 report_fatal_error("X86 interrupts may not return any value");
2069 SmallVector<CCValAssign, 16> RVLocs;
2070 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2071 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2074 SmallVector<SDValue, 6> RetOps;
2075 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2076 // Operand #1 = Bytes To Pop
2077 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2080 // Copy the result values into the output registers.
2081 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2082 CCValAssign &VA = RVLocs[i];
2083 assert(VA.isRegLoc() && "Can only return in registers!");
2084 SDValue ValToCopy = OutVals[i];
2085 EVT ValVT = ValToCopy.getValueType();
2087 // Promote values to the appropriate types.
2088 if (VA.getLocInfo() == CCValAssign::SExt)
2089 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2090 else if (VA.getLocInfo() == CCValAssign::ZExt)
2091 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2092 else if (VA.getLocInfo() == CCValAssign::AExt) {
2093 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2094 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2096 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2098 else if (VA.getLocInfo() == CCValAssign::BCvt)
2099 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2101 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2102 "Unexpected FP-extend for return value.");
2104 // If this is x86-64, and we disabled SSE, we can't return FP values,
2105 // or SSE or MMX vectors.
2106 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2107 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2108 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2109 report_fatal_error("SSE register return with SSE disabled");
2111 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2112 // llvm-gcc has never done it right and no one has noticed, so this
2113 // should be OK for now.
2114 if (ValVT == MVT::f64 &&
2115 (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2116 report_fatal_error("SSE2 register return with SSE2 disabled");
2118 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2119 // the RET instruction and handled by the FP Stackifier.
2120 if (VA.getLocReg() == X86::FP0 ||
2121 VA.getLocReg() == X86::FP1) {
2122 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2123 // change the value to the FP stack register class.
2124 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2125 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2126 RetOps.push_back(ValToCopy);
2127 // Don't emit a copytoreg.
2131 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2132 // which is returned in RAX / RDX.
2133 if (Subtarget.is64Bit()) {
2134 if (ValVT == MVT::x86mmx) {
2135 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2136 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2137 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2139 // If we don't have SSE2 available, convert to v4f32 so the generated
2140 // register is legal.
2141 if (!Subtarget.hasSSE2())
2142 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2147 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2148 Flag = Chain.getValue(1);
2149 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2152 // Swift calling convention does not require we copy the sret argument
2153 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2155 // All x86 ABIs require that for returning structs by value we copy
2156 // the sret argument into %rax/%eax (depending on ABI) for the return.
2157 // We saved the argument into a virtual register in the entry block,
2158 // so now we copy the value out and into %rax/%eax.
2160 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2161 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2162 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2163 // either case FuncInfo->setSRetReturnReg() will have been called.
2164 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2165 // When we have both sret and another return value, we should use the
2166 // original Chain stored in RetOps[0], instead of the current Chain updated
2167 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2169 // For the case of sret and another return value, we have
2170 // Chain_0 at the function entry
2171 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2172 // If we use Chain_1 in getCopyFromReg, we will have
2173 // Val = getCopyFromReg(Chain_1)
2174 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2176 // getCopyToReg(Chain_0) will be glued together with
2177 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2178 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2179 // Data dependency from Unit B to Unit A due to usage of Val in
2180 // getCopyToReg(Chain_1, Val)
2181 // Chain dependency from Unit A to Unit B
2183 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2184 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2185 getPointerTy(MF.getDataLayout()));
2188 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2189 X86::RAX : X86::EAX;
2190 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2191 Flag = Chain.getValue(1);
2193 // RAX/EAX now acts like a return value.
2195 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2198 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2199 const MCPhysReg *I =
2200 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2203 if (X86::GR64RegClass.contains(*I))
2204 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2206 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2210 RetOps[0] = Chain; // Update chain.
2212 // Add the flag if we have it.
2214 RetOps.push_back(Flag);
2216 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2217 if (CallConv == CallingConv::X86_INTR)
2218 opcode = X86ISD::IRET;
2219 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2222 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2223 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2226 SDValue TCChain = Chain;
2227 SDNode *Copy = *N->use_begin();
2228 if (Copy->getOpcode() == ISD::CopyToReg) {
2229 // If the copy has a glue operand, we conservatively assume it isn't safe to
2230 // perform a tail call.
2231 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2233 TCChain = Copy->getOperand(0);
2234 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2237 bool HasRet = false;
2238 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2240 if (UI->getOpcode() != X86ISD::RET_FLAG)
2242 // If we are returning more than one value, we can definitely
2243 // not make a tail call; see PR19530.
2244 if (UI->getNumOperands() > 4)
2246 if (UI->getNumOperands() == 4 &&
2247 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2259 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2260 ISD::NodeType ExtendKind) const {
2261 MVT ReturnMVT = MVT::i32;
2263 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2264 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2265 // The ABI does not require i1, i8 or i16 to be extended.
2267 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2268 // always extending i8/i16 return values, so keep doing that for now.
2270 ReturnMVT = MVT::i8;
2273 EVT MinVT = getRegisterType(Context, ReturnMVT);
2274 return VT.bitsLT(MinVT) ? MinVT : VT;
2277 /// Lower the result values of a call into the
2278 /// appropriate copies out of appropriate physical registers.
2280 SDValue X86TargetLowering::LowerCallResult(
2281 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2282 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2283 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2285 // Assign locations to each value returned by this call.
2286 SmallVector<CCValAssign, 16> RVLocs;
2287 bool Is64Bit = Subtarget.is64Bit();
2288 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2290 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2292 // Copy all of the result registers out of their specified physreg.
2293 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2294 CCValAssign &VA = RVLocs[i];
2295 EVT CopyVT = VA.getLocVT();
2297 // If this is x86-64, and we disabled SSE, we can't return FP values
2298 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2299 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2300 report_fatal_error("SSE register return with SSE disabled");
2303 // If we prefer to use the value in xmm registers, copy it out as f80 and
2304 // use a truncate to move it from fp stack reg to xmm reg.
2305 bool RoundAfterCopy = false;
2306 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2307 isScalarFPTypeInSSEReg(VA.getValVT())) {
2308 if (!Subtarget.hasX87())
2309 report_fatal_error("X87 register return with X87 disabled");
2311 RoundAfterCopy = (CopyVT != VA.getLocVT());
2314 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2315 CopyVT, InFlag).getValue(1);
2316 SDValue Val = Chain.getValue(0);
2319 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2320 // This truncation won't change the value.
2321 DAG.getIntPtrConstant(1, dl));
2323 if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
2324 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2326 InFlag = Chain.getValue(2);
2327 InVals.push_back(Val);
2333 //===----------------------------------------------------------------------===//
2334 // C & StdCall & Fast Calling Convention implementation
2335 //===----------------------------------------------------------------------===//
2336 // The StdCall calling convention is the standard for many Windows API
2337 // routines. It differs from the C calling convention only slightly: the
2338 // callee cleans up the stack rather than the caller, and symbols are
2339 // decorated differently. It doesn't support any vector arguments.
2340 // For info on fast calling convention see Fast Calling Convention (tail call)
2341 // implementation LowerX86_32FastCCCallTo.
2343 /// CallIsStructReturn - Determines whether a call uses struct return semantics.
2345 enum StructReturnType {
2350 static StructReturnType
2351 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2353 return NotStructReturn;
2355 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2356 if (!Flags.isSRet())
2357 return NotStructReturn;
2358 if (Flags.isInReg() || IsMCU)
2359 return RegStructReturn;
2360 return StackStructReturn;
2363 /// Determines whether a function uses struct return semantics.
2364 static StructReturnType
2365 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2367 return NotStructReturn;
2369 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2370 if (!Flags.isSRet())
2371 return NotStructReturn;
2372 if (Flags.isInReg() || IsMCU)
2373 return RegStructReturn;
2374 return StackStructReturn;
2377 /// Make a copy of an aggregate at address specified by "Src" to address
2378 /// "Dst" with size and alignment information specified by the specific
2379 /// parameter attribute. The copy will be passed as a byval function parameter.
2380 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2381 SDValue Chain, ISD::ArgFlagsTy Flags,
2382 SelectionDAG &DAG, const SDLoc &dl) {
2383 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2385 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2386 /*isVolatile*/false, /*AlwaysInline=*/true,
2387 /*isTailCall*/false,
2388 MachinePointerInfo(), MachinePointerInfo());
2391 /// Return true if the calling convention is one that we can guarantee TCO for.
2392 static bool canGuaranteeTCO(CallingConv::ID CC) {
2393 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2394 CC == CallingConv::HiPE || CC == CallingConv::HHVM);
2397 /// Return true if we might ever do TCO for calls with this calling convention.
2398 static bool mayTailCallThisCC(CallingConv::ID CC) {
2400 // C calling conventions:
2401 case CallingConv::C:
2402 case CallingConv::X86_64_Win64:
2403 case CallingConv::X86_64_SysV:
2404 // Callee pop conventions:
2405 case CallingConv::X86_ThisCall:
2406 case CallingConv::X86_StdCall:
2407 case CallingConv::X86_VectorCall:
2408 case CallingConv::X86_FastCall:
2411 return canGuaranteeTCO(CC);
2415 /// Return true if the function is being made into a tailcall target by
2416 /// changing its ABI.
2417 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2418 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2421 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2423 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2424 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2428 CallingConv::ID CalleeCC = CS.getCallingConv();
2429 if (!mayTailCallThisCC(CalleeCC))
2436 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2437 const SmallVectorImpl<ISD::InputArg> &Ins,
2438 const SDLoc &dl, SelectionDAG &DAG,
2439 const CCValAssign &VA,
2440 MachineFrameInfo *MFI, unsigned i) const {
2441 // Create the nodes corresponding to a load from this parameter slot.
2442 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2443 bool AlwaysUseMutable = shouldGuaranteeTCO(
2444 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2445 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2448 // If the value is passed by pointer, we have the address passed instead of the value itself.
2450 bool ExtendedInMem = VA.isExtInLoc() &&
2451 VA.getValVT().getScalarType() == MVT::i1;
2453 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2454 ValVT = VA.getLocVT();
2456 ValVT = VA.getValVT();
2458 // Calculate the SP offset of an interrupt parameter; it re-uses the slot
2459 // normally taken by the return address.
2461 if (CallConv == CallingConv::X86_INTR) {
2462 const X86Subtarget& Subtarget =
2463 static_cast<const X86Subtarget&>(DAG.getSubtarget());
2464 // X86 interrupt handlers may take one or two arguments.
2465 // Unlike a regular call, there is no return address on the stack, so the
2466 // offset of the last argument must be set to -4/-8 bytes, and the offset
2467 // of the first of two arguments must be set to 0 bytes.
2468 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2471 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2472 // changed with more analysis.
2473 // In case of tail call optimization, mark all arguments mutable, since they
2474 // could be overwritten by the lowering of arguments in a tail call.
2475 if (Flags.isByVal()) {
2476 unsigned Bytes = Flags.getByValSize();
2477 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2478 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2479 // Adjust SP offset of interrupt parameter.
2480 if (CallConv == CallingConv::X86_INTR) {
2481 MFI->setObjectOffset(FI, Offset);
2483 return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2485 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2486 VA.getLocMemOffset(), isImmutable);
2488 // Set SExt or ZExt flag.
2489 if (VA.getLocInfo() == CCValAssign::ZExt) {
2490 MFI->setObjectZExt(FI, true);
2491 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2492 MFI->setObjectSExt(FI, true);
2495 // Adjust SP offset of interrupt parameter.
2496 if (CallConv == CallingConv::X86_INTR) {
2497 MFI->setObjectOffset(FI, Offset);
2500 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2501 SDValue Val = DAG.getLoad(
2502 ValVT, dl, Chain, FIN,
2503 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2504 return ExtendedInMem ?
2505 DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2509 // FIXME: Get this from tablegen.
2510 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2511 const X86Subtarget &Subtarget) {
2512 assert(Subtarget.is64Bit());
2514 if (Subtarget.isCallingConvWin64(CallConv)) {
2515 static const MCPhysReg GPR64ArgRegsWin64[] = {
2516 X86::RCX, X86::RDX, X86::R8, X86::R9
2518 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2521 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2522 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2524 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2527 // FIXME: Get this from tablegen.
2528 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2529 CallingConv::ID CallConv,
2530 const X86Subtarget &Subtarget) {
2531 assert(Subtarget.is64Bit());
2532 if (Subtarget.isCallingConvWin64(CallConv)) {
2533 // The XMM registers which might contain var arg parameters are shadowed
2534 // in their paired GPRs, so we only need to save the GPRs to their home slots.
2536 // TODO: __vectorcall will change this.
2540 const Function *Fn = MF.getFunction();
2541 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2542 bool isSoftFloat = Subtarget.useSoftFloat();
2543 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2544 "SSE register cannot be used when SSE is disabled!");
2545 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2546 // Kernel mode asks for SSE to be disabled, so there are no XMM argument registers.
2550 static const MCPhysReg XMMArgRegs64Bit[] = {
2551 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2552 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2554 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2557 SDValue X86TargetLowering::LowerFormalArguments(
2558 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2559 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2560 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2561 MachineFunction &MF = DAG.getMachineFunction();
2562 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2563 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2565 const Function *Fn = MF.getFunction();
2566 if (Fn->hasExternalLinkage() &&
2567 Subtarget.isTargetCygMing() &&
2568 Fn->getName() == "main")
2569 FuncInfo->setForceFramePointer(true);
2571 MachineFrameInfo *MFI = MF.getFrameInfo();
2572 bool Is64Bit = Subtarget.is64Bit();
2573 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2575 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2576 "Var args not supported with calling convention fastcc, ghc or hipe");
2578 if (CallConv == CallingConv::X86_INTR) {
2579 bool isLegal = Ins.size() == 1 ||
2580 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2581 (!Is64Bit && Ins[1].VT == MVT::i32)));
2583 report_fatal_error("X86 interrupts may take one or two arguments");
2586 // Assign locations to all of the incoming arguments.
2587 SmallVector<CCValAssign, 16> ArgLocs;
2588 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2590 // Allocate shadow area for Win64
2592 CCInfo.AllocateStack(32, 8);
2594 CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2596 unsigned LastVal = ~0U;
2598 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2599 CCValAssign &VA = ArgLocs[i];
2600 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later occurrences.
2602 assert(VA.getValNo() != LastVal &&
2603 "Don't support value assigned to multiple locs yet");
2605 LastVal = VA.getValNo();
2607 if (VA.isRegLoc()) {
2608 EVT RegVT = VA.getLocVT();
2609 const TargetRegisterClass *RC;
2610 if (RegVT == MVT::i32)
2611 RC = &X86::GR32RegClass;
2612 else if (Is64Bit && RegVT == MVT::i64)
2613 RC = &X86::GR64RegClass;
2614 else if (RegVT == MVT::f32)
2615 RC = &X86::FR32RegClass;
2616 else if (RegVT == MVT::f64)
2617 RC = &X86::FR64RegClass;
2618 else if (RegVT == MVT::f128)
2619 RC = &X86::FR128RegClass;
2620 else if (RegVT.is512BitVector())
2621 RC = &X86::VR512RegClass;
2622 else if (RegVT.is256BitVector())
2623 RC = &X86::VR256RegClass;
2624 else if (RegVT.is128BitVector())
2625 RC = &X86::VR128RegClass;
2626 else if (RegVT == MVT::x86mmx)
2627 RC = &X86::VR64RegClass;
2628 else if (RegVT == MVT::i1)
2629 RC = &X86::VK1RegClass;
2630 else if (RegVT == MVT::v8i1)
2631 RC = &X86::VK8RegClass;
2632 else if (RegVT == MVT::v16i1)
2633 RC = &X86::VK16RegClass;
2634 else if (RegVT == MVT::v32i1)
2635 RC = &X86::VK32RegClass;
2636 else if (RegVT == MVT::v64i1)
2637 RC = &X86::VK64RegClass;
2639 llvm_unreachable("Unknown argument type!");
2641 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2642 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2644 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2645 // bits. Insert an assert[sz]ext to capture this, then truncate to the right size.
2647 if (VA.getLocInfo() == CCValAssign::SExt)
2648 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2649 DAG.getValueType(VA.getValVT()));
2650 else if (VA.getLocInfo() == CCValAssign::ZExt)
2651 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2652 DAG.getValueType(VA.getValVT()));
2653 else if (VA.getLocInfo() == CCValAssign::BCvt)
2654 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2656 if (VA.isExtInLoc()) {
2657 // Handle MMX values passed in XMM regs.
2658 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2659 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2661 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2664 assert(VA.isMemLoc());
2665 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2668 // If value is passed via pointer - do a load.
2669 if (VA.getLocInfo() == CCValAssign::Indirect)
2671 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
2673 InVals.push_back(ArgValue);
2676 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2677 // Swift calling convention does not require we copy the sret argument
2678 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
2679 if (CallConv == CallingConv::Swift)
2682 // All x86 ABIs require that for returning structs by value we copy the
2683 // sret argument into %rax/%eax (depending on ABI) for the return. Save
2684 // the argument into a virtual register so that we can access it from the
2686 if (Ins[i].Flags.isSRet()) {
2687 unsigned Reg = FuncInfo->getSRetReturnReg();
2689 MVT PtrTy = getPointerTy(DAG.getDataLayout());
2690 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2691 FuncInfo->setSRetReturnReg(Reg);
2693 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2694 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2699 unsigned StackSize = CCInfo.getNextStackOffset();
2700 // Align stack specially for tail calls.
2701 if (shouldGuaranteeTCO(CallConv,
2702 MF.getTarget().Options.GuaranteedTailCallOpt))
2703 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2705 // If the function takes variable number of arguments, make a frame index for
2706 // the start of the first vararg value... for expansion of llvm.va_start. We
2707 // can skip this if there are no va_start calls.
2708 if (MFI->hasVAStart() &&
2709 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2710 CallConv != CallingConv::X86_ThisCall))) {
2711 FuncInfo->setVarArgsFrameIndex(
2712 MFI->CreateFixedObject(1, StackSize, true));
2715 // Figure out if XMM registers are in use.
2716 assert(!(Subtarget.useSoftFloat() &&
2717 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2718 "SSE register cannot be used when SSE is disabled!");
2720 // 64-bit calling conventions support varargs and register parameters, so we
2721 // have to do extra work to spill them in the prologue.
2722 if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2723 // Find the first unallocated argument registers.
2724 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2725 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2726 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
2727 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
2728 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
2729 "SSE register cannot be used when SSE is disabled!");
2731 // Gather all the live in physical registers.
2732 SmallVector<SDValue, 6> LiveGPRs;
2733 SmallVector<SDValue, 8> LiveXMMRegs;
2735 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2736 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2738 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2740 if (!ArgXMMs.empty()) {
2741 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2742 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2743 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2744 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2745 LiveXMMRegs.push_back(
2746 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2751 // Get to the caller-allocated home save location. Add 8 to account
2752 // for the return address.
2753 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2754 FuncInfo->setRegSaveFrameIndex(
2755 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2756 // Fixup to set vararg frame on shadow area (4 x i64).
2758 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2760 // For X86-64, if there are vararg parameters that are passed via
2761 // registers, then we must store them to their spots on the stack so
2762 // they may be loaded by dereferencing the result of va_next.
2763 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2764 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2765 FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2766 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2769 // Store the integer parameter registers.
2770 SmallVector<SDValue, 8> MemOps;
2771 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2772 getPointerTy(DAG.getDataLayout()));
2773 unsigned Offset = FuncInfo->getVarArgsGPOffset();
2774 for (SDValue Val : LiveGPRs) {
2775 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2776 RSFIN, DAG.getIntPtrConstant(Offset, dl));
2778 DAG.getStore(Val.getValue(1), dl, Val, FIN,
2779 MachinePointerInfo::getFixedStack(
2780 DAG.getMachineFunction(),
2781 FuncInfo->getRegSaveFrameIndex(), Offset));
2782 MemOps.push_back(Store);
2786 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2787 // Now store the XMM (fp + vector) parameter registers.
2788 SmallVector<SDValue, 12> SaveXMMOps;
2789 SaveXMMOps.push_back(Chain);
2790 SaveXMMOps.push_back(ALVal);
2791 SaveXMMOps.push_back(DAG.getIntPtrConstant(
2792 FuncInfo->getRegSaveFrameIndex(), dl));
2793 SaveXMMOps.push_back(DAG.getIntPtrConstant(
2794 FuncInfo->getVarArgsFPOffset(), dl));
2795 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2797 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2798 MVT::Other, SaveXMMOps));
2801 if (!MemOps.empty())
2802 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2805 if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2806 // Find the largest legal vector type.
2807 MVT VecVT = MVT::Other;
2808 // FIXME: Only some x86_32 calling conventions support AVX512.
2809 if (Subtarget.hasAVX512() &&
2810 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2811 CallConv == CallingConv::Intel_OCL_BI)))
2812 VecVT = MVT::v16f32;
2813 else if (Subtarget.hasAVX())
2815 else if (Subtarget.hasSSE2())
2818 // We forward some GPRs and some vector types.
2819 SmallVector<MVT, 2> RegParmTypes;
2820 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2821 RegParmTypes.push_back(IntVT);
2822 if (VecVT != MVT::Other)
2823 RegParmTypes.push_back(VecVT);
2825 // Compute the set of forwarded registers. The rest are scratch.
2826 SmallVectorImpl<ForwardedRegister> &Forwards =
2827 FuncInfo->getForwardedMustTailRegParms();
2828 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2830 // Conservatively forward AL on x86_64, since it might be used for varargs.
2831 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2832 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2833 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2836 // Copy all forwards from physical to virtual registers.
2837 for (ForwardedRegister &F : Forwards) {
2838 // FIXME: Can we use a less constrained schedule?
2839 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2840 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2841 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2845 // Some CCs need callee pop.
2846 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2847 MF.getTarget().Options.GuaranteedTailCallOpt)) {
2848 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2849 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
2850 // X86 interrupts must pop the error code if present
2851 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
2853 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2854 // If this is an sret function, the return should pop the hidden pointer.
2855 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
2856 !Subtarget.getTargetTriple().isOSMSVCRT() &&
2857 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
2858 FuncInfo->setBytesToPopOnReturn(4);
2862 // RegSaveFrameIndex is X86-64 only.
2863 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2864 if (CallConv == CallingConv::X86_FastCall ||
2865 CallConv == CallingConv::X86_ThisCall)
2866 // fastcall and thiscall functions can't have varargs.
2867 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2870 FuncInfo->setArgumentStackSize(StackSize);
2872 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
2873 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
2874 if (Personality == EHPersonality::CoreCLR) {
2876 // TODO: Add a mechanism to frame lowering that will allow us to indicate
2877 // that we'd prefer this slot be allocated towards the bottom of the frame
2878 // (i.e. near the stack pointer after allocating the frame). Every
2879 // funclet needs a copy of this slot in its (mostly empty) frame, and the
2880 // offset from the bottom of this and each funclet's frame must be the
2881 // same, so the size of funclets' (mostly empty) frames is dictated by
2882 // how far this slot is from the bottom (since they allocate just enough
2883 // space to accommodate holding this slot at the correct offset).
2884 int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
2885 EHInfo->PSPSymFrameIdx = PSPSymFI;
2892 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
2893 SDValue Arg, const SDLoc &dl,
2895 const CCValAssign &VA,
2896 ISD::ArgFlagsTy Flags) const {
2897 unsigned LocMemOffset = VA.getLocMemOffset();
2898 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
2899 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2901 if (Flags.isByVal())
2902 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2904 return DAG.getStore(
2905 Chain, dl, Arg, PtrOff,
2906 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
2909 /// Emit a load of the return address if tail call
2910 /// optimization is performed and it is required.
2911 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
2912 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
2913 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
2914 // Adjust the Return address stack slot.
2915 EVT VT = getPointerTy(DAG.getDataLayout());
2916 OutRetAddr = getReturnAddressFrameIndex(DAG);
2918 // Load the "old" Return address.
2919 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
2920 return SDValue(OutRetAddr.getNode(), 1);
2923 /// Emit a store of the return address if tail call
2924 /// optimization is performed and it is required (FPDiff!=0).
2925 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2926 SDValue Chain, SDValue RetAddrFrIdx,
2927 EVT PtrVT, unsigned SlotSize,
2928 int FPDiff, const SDLoc &dl) {
2929 // Store the return address to the appropriate stack slot.
2930 if (!FPDiff) return Chain;
2931 // Calculate the new stack slot for the return address.
2932 int NewReturnAddrFI =
2933 MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2935 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2936 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2937 MachinePointerInfo::getFixedStack(
2938 DAG.getMachineFunction(), NewReturnAddrFI));
2942 /// Returns a vector_shuffle mask for a movs{s|d} or movd
2943 /// operation of the specified width.
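/// For example, for v4f32 the mask is <4, 1, 2, 3>: element 0 is taken from V2
/// and the remaining elements from V1, matching MOVSS semantics.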
2944 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
2946 unsigned NumElems = VT.getVectorNumElements();
2947 SmallVector<int, 8> Mask;
2948 Mask.push_back(NumElems);
2949 for (unsigned i = 1; i != NumElems; ++i)
2951 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
2955 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2956 SmallVectorImpl<SDValue> &InVals) const {
2957 SelectionDAG &DAG = CLI.DAG;
2959 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2960 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2961 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2962 SDValue Chain = CLI.Chain;
2963 SDValue Callee = CLI.Callee;
2964 CallingConv::ID CallConv = CLI.CallConv;
2965 bool &isTailCall = CLI.IsTailCall;
2966 bool isVarArg = CLI.IsVarArg;
2968 MachineFunction &MF = DAG.getMachineFunction();
2969 bool Is64Bit = Subtarget.is64Bit();
2970 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2971 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
2972 bool IsSibcall = false;
2973 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2974 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
2976 if (CallConv == CallingConv::X86_INTR)
2977 report_fatal_error("X86 interrupts may not be called directly");
2979 if (Attr.getValueAsString() == "true")
2982 if (Subtarget.isPICStyleGOT() &&
2983 !MF.getTarget().Options.GuaranteedTailCallOpt) {
2984 // If we are using a GOT, disable tail calls to external symbols with
2985 // default visibility. Tail calling such a symbol requires using a GOT
2986 // relocation, which forces early binding of the symbol. This breaks code
2987 // that requires lazy function symbol resolution. Using musttail or
2988 // GuaranteedTailCallOpt will override this.
2989 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2990 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2991 G->getGlobal()->hasDefaultVisibility()))
2995 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2996 if (IsMustTail) {
2997 // Force this to be a tail call. The verifier rules are enough to ensure
2998 // that we can lower this successfully without moving the return address
2999 // around.
3000 isTailCall = true;
3001 } else if (isTailCall) {
3002 // Check if it's really possible to do a tail call.
3003 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3004 isVarArg, SR != NotStructReturn,
3005 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3006 Outs, OutVals, Ins, DAG);
3008 // Sibcalls are automatically detected tailcalls which do not require
3009 // ABI changes.
3010 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3017 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3018 "Var args not supported with calling convention fastcc, ghc or hipe");
3020 // Analyze operands of the call, assigning locations to each operand.
3021 SmallVector<CCValAssign, 16> ArgLocs;
3022 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3024 // Allocate shadow area for Win64
3025 if (IsWin64)
3026 CCInfo.AllocateStack(32, 8);
3028 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3030 // Get a count of how many bytes are to be pushed on the stack.
3031 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3033 // This is a sibcall. The memory operands are already in place in the
3034 // caller's incoming argument area, which was allocated by the caller's own caller.
3036 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3037 canGuaranteeTCO(CallConv))
3038 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3041 if (isTailCall && !IsSibcall && !IsMustTail) {
3042 // Lower arguments at fp - stackoffset + fpdiff.
3043 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3045 FPDiff = NumBytesCallerPushed - NumBytes;
3047 // Record how far the return-address stack slot has to move, keeping the
3048 // largest adjustment (the most negative FPDiff) seen so far.
3049 if (FPDiff < X86Info->getTCReturnAddrDelta())
3050 X86Info->setTCReturnAddrDelta(FPDiff);
3053 unsigned NumBytesToPush = NumBytes;
3054 unsigned NumBytesToPop = NumBytes;
3056 // If we have an inalloca argument, all stack space has already been allocated
3057 // for us and is right at the top of the stack. We don't support multiple
3058 // arguments passed in memory when using inalloca.
3059 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3061 if (!ArgLocs.back().isMemLoc())
3062 report_fatal_error("cannot use inalloca attribute on a register "
3064 if (ArgLocs.back().getLocMemOffset() != 0)
3065 report_fatal_error("any parameter with the inalloca attribute must be "
3066 "the only memory argument");
3070 Chain = DAG.getCALLSEQ_START(
3071 Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3073 SDValue RetAddrFrIdx;
3074 // Load return address for tail calls.
3075 if (isTailCall && FPDiff)
3076 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3077 Is64Bit, FPDiff, dl);
3079 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3080 SmallVector<SDValue, 8> MemOpChains;
3083 // Walk the register/memloc assignments, inserting copies/loads. In the case
3084 // of tail call optimization, arguments are handled later.
3085 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3086 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3087 // Skip inalloca arguments, they have already been written.
3088 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3089 if (Flags.isInAlloca())
3092 CCValAssign &VA = ArgLocs[i];
3093 EVT RegVT = VA.getLocVT();
3094 SDValue Arg = OutVals[i];
3095 bool isByVal = Flags.isByVal();
3097 // Promote the value if needed.
3098 switch (VA.getLocInfo()) {
3099 default: llvm_unreachable("Unknown loc info!");
3100 case CCValAssign::Full: break;
3101 case CCValAssign::SExt:
3102 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3104 case CCValAssign::ZExt:
3105 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3107 case CCValAssign::AExt:
3108 if (Arg.getValueType().isVector() &&
3109 Arg.getValueType().getVectorElementType() == MVT::i1)
3110 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3111 else if (RegVT.is128BitVector()) {
3112 // Special case: passing MMX values in XMM registers.
3113 Arg = DAG.getBitcast(MVT::i64, Arg);
3114 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3115 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3117 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3119 case CCValAssign::BCvt:
3120 Arg = DAG.getBitcast(RegVT, Arg);
3122 case CCValAssign::Indirect: {
3123 // Store the argument.
3124 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3125 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3126 Chain = DAG.getStore(
3127 Chain, dl, Arg, SpillSlot,
3128 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3134 if (VA.isRegLoc()) {
3135 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3136 if (isVarArg && IsWin64) {
3137 // The Win64 ABI requires an XMM argument register to also be copied to the
3138 // corresponding integer shadow register if the callee is a varargs function.
3139 unsigned ShadowReg = 0;
3140 switch (VA.getLocReg()) {
3141 case X86::XMM0: ShadowReg = X86::RCX; break;
3142 case X86::XMM1: ShadowReg = X86::RDX; break;
3143 case X86::XMM2: ShadowReg = X86::R8; break;
3144 case X86::XMM3: ShadowReg = X86::R9; break;
3147 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3149 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3150 assert(VA.isMemLoc());
3151 if (!StackPtr.getNode())
3152 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3153 getPointerTy(DAG.getDataLayout()));
3154 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3155 dl, DAG, VA, Flags));
3159 if (!MemOpChains.empty())
3160 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3162 if (Subtarget.isPICStyleGOT()) {
3163 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3164 // GOT pointer.
3165 if (!isTailCall) {
3166 RegsToPass.push_back(std::make_pair(
3167 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3168 getPointerTy(DAG.getDataLayout()))));
3170 // If we are tail calling and generating PIC/GOT style code load the
3171 // address of the callee into ECX. The value in ecx is used as target of
3172 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3173 // for tail calls on PIC/GOT architectures. Normally we would just put the
3174 // address of GOT into ebx and then call target@PLT. But for tail calls
3175 // ebx would be restored (since ebx is callee saved) before jumping to the
3176 // target.
3177 //
3178 // Note: The actual moving to ECX is done further down.
3179 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3180 if (G && !G->getGlobal()->hasLocalLinkage() &&
3181 G->getGlobal()->hasDefaultVisibility())
3182 Callee = LowerGlobalAddress(Callee, DAG);
3183 else if (isa<ExternalSymbolSDNode>(Callee))
3184 Callee = LowerExternalSymbol(Callee, DAG);
3188 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3189 // From AMD64 ABI document:
3190 // For calls that may call functions that use varargs or stdargs
3191 // (prototype-less calls or calls to functions containing ellipsis (...) in
3192 // the declaration) %al is used as hidden argument to specify the number
3193 // of SSE registers used. The contents of %al do not need to match exactly
3194 // the number of registers, but must be an upper bound on the number of SSE
3195 // registers used and is in the range 0 - 8 inclusive.
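// For example, a variadic call that passes two doubles in XMM0 and XMM1 could
// legally set %al to any value from 2 through 8; the code below uses the exact
// count of XMM registers allocated.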
3197 // Count the number of XMM registers allocated.
3198 static const MCPhysReg XMMArgRegs[] = {
3199 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3200 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3202 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3203 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3204 && "SSE registers cannot be used when SSE is disabled");
3206 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3207 DAG.getConstant(NumXMMRegs, dl,
3211 if (isVarArg && IsMustTail) {
3212 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3213 for (const auto &F : Forwards) {
3214 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3215 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3219 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3220 // don't need this because the eligibility check rejects calls that require
3221 // shuffling arguments passed in memory.
3222 if (!IsSibcall && isTailCall) {
3223 // Force all the incoming stack arguments to be loaded from the stack
3224 // before any new outgoing arguments are stored to the stack, because the
3225 // outgoing stack slots may alias the incoming argument stack slots, and
3226 // the alias isn't otherwise explicit. This is slightly more conservative
3227 // than necessary, because it means that each store effectively depends
3228 // on every argument instead of just those arguments it would clobber.
3229 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3231 SmallVector<SDValue, 8> MemOpChains2;
3234 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3235 CCValAssign &VA = ArgLocs[i];
3238 assert(VA.isMemLoc());
3239 SDValue Arg = OutVals[i];
3240 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3241 // Skip inalloca arguments. They don't require any work.
3242 if (Flags.isInAlloca())
3244 // Create frame index.
3245 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3246 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3247 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3248 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3250 if (Flags.isByVal()) {
3251 // Copy relative to framepointer.
3252 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3253 if (!StackPtr.getNode())
3254 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3255 getPointerTy(DAG.getDataLayout()));
3256 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3259 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3263 // Store relative to framepointer.
3264 MemOpChains2.push_back(DAG.getStore(
3265 ArgChain, dl, Arg, FIN,
3266 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3270 if (!MemOpChains2.empty())
3271 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3273 // Store the return address to the appropriate stack slot.
3274 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3275 getPointerTy(DAG.getDataLayout()),
3276 RegInfo->getSlotSize(), FPDiff, dl);
3279 // Build a sequence of copy-to-reg nodes chained together with token chain
3280 // and flag operands which copy the outgoing args into registers.
3282 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3283 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3284 RegsToPass[i].second, InFlag);
3285 InFlag = Chain.getValue(1);
3288 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3289 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3290 // In the 64-bit large code model, we have to make all calls
3291 // through a register, since the call instruction's 32-bit
3292 // pc-relative offset may not be large enough to hold the whole
3293 // address.
3294 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3295 // If the callee is a GlobalAddress node (quite common, every direct call
3296 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3297 // it.
3298 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3300 // We should use an extra load for direct calls to dllimported functions in
3301 // non-JIT mode.
3302 const GlobalValue *GV = G->getGlobal();
3303 if (!GV->hasDLLImportStorageClass()) {
3304 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3306 Callee = DAG.getTargetGlobalAddress(
3307 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3309 if (OpFlags == X86II::MO_GOTPCREL) {
3310 // Add a wrapper.
3311 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3312 getPointerTy(DAG.getDataLayout()), Callee);
3313 // Add extra indirection
3314 Callee = DAG.getLoad(
3315 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3316 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3319 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3320 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3321 unsigned char OpFlags =
3322 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3324 Callee = DAG.getTargetExternalSymbol(
3325 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3326 } else if (Subtarget.isTarget64BitILP32() &&
3327 Callee->getValueType(0) == MVT::i32) {
3328 // Zero-extend the 32-bit Callee address to 64 bits, as required by the x32 ABI
3329 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3332 // Returns a chain & a flag for retval copy to use.
3333 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3334 SmallVector<SDValue, 8> Ops;
3336 if (!IsSibcall && isTailCall) {
3337 Chain = DAG.getCALLSEQ_END(Chain,
3338 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3339 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3340 InFlag = Chain.getValue(1);
3343 Ops.push_back(Chain);
3344 Ops.push_back(Callee);
3347 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3349 // Add argument registers to the end of the list so that they are known live
3351 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3352 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3353 RegsToPass[i].second.getValueType()));
3355 // Add a register mask operand representing the call-preserved registers.
3356 const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
3357 assert(Mask && "Missing call preserved mask for calling convention");
3359 // If this is an invoke in a 32-bit function using a funclet-based
3360 // personality, assume the function clobbers all registers. If an exception
3361 // is thrown, the runtime will not restore CSRs.
3362 // FIXME: Model this more precisely so that we can register allocate across
3363 // the normal edge and spill and fill across the exceptional edge.
3364 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3365 const Function *CallerFn = MF.getFunction();
3366 EHPersonality Pers =
3367 CallerFn->hasPersonalityFn()
3368 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3369 : EHPersonality::Unknown;
3370 if (isFuncletEHPersonality(Pers))
3371 Mask = RegInfo->getNoPreservedMask();
3374 Ops.push_back(DAG.getRegisterMask(Mask));
3376 if (InFlag.getNode())
3377 Ops.push_back(InFlag);
3381 //// If this is the first return lowered for this function, add the regs
3382 //// to the liveout set for the function.
3383 // This isn't right, although it's probably harmless on x86; liveouts
3384 // should be computed from returns not tail calls. Consider a void
3385 // function making a tail call to a function returning int.
3386 MF.getFrameInfo()->setHasTailCall();
3387 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3390 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3391 InFlag = Chain.getValue(1);
3393 // Create the CALLSEQ_END node.
3394 unsigned NumBytesForCalleeToPop;
3395 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3396 DAG.getTarget().Options.GuaranteedTailCallOpt))
3397 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3398 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3399 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3400 SR == StackStructReturn)
3401 // If this is a call to a struct-return function, the callee
3402 // pops the hidden struct pointer, so we have to push it back.
3403 // This is common for Darwin/X86, Linux & Mingw32 targets.
3404 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
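// (The callee returns with `ret $4`, so the 4 bytes of the hidden sret pointer
// are credited back here.)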
3405 NumBytesForCalleeToPop = 4;
3407 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3409 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3410 // No need to reset the stack after the call if the call doesn't return. To
3411 // keep the MachineInstr verifier happy, we'll pretend the callee does it for us.
3412 NumBytesForCalleeToPop = NumBytes;
3415 // Returns a flag for retval copy to use.
3417 Chain = DAG.getCALLSEQ_END(Chain,
3418 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3419 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3422 InFlag = Chain.getValue(1);
3425 // Handle result values, copying them out of physregs into vregs that we
3427 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3428 Ins, dl, DAG, InVals);
3431 //===----------------------------------------------------------------------===//
3432 // Fast Calling Convention (tail call) implementation
3433 //===----------------------------------------------------------------------===//
3435 // Like stdcall, the callee cleans up the arguments; unlike stdcall, ECX is
3436 // reserved for storing the tail-called function's address. Only 2 registers are
3437 // free for argument passing (inreg). Tail call optimization is performed
3438 // provided that:
3439 //  * tailcallopt is enabled
3440 //  * caller/callee are fastcc
3441 // On the X86_64 architecture with GOT-style position-independent code, only
3442 // local (within-module) calls are supported at the moment.
3443 // To keep the stack aligned according to the platform ABI, the function
3444 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3445 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
3446 // for example.) If a tail-called callee has more arguments than the caller, the
3447 // caller needs to make sure that there is room to move the RETADDR to. This is
3448 // achieved by reserving an area the size of the argument delta right after the
3449 // original RETADDR, but before the saved frame pointer or the spilled registers,
3450 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
3462 /// Round the stack size up so that, together with the return-address slot, it
3463 /// satisfies the stack-alignment requirement (e.g. 16n + 12 for 16-byte alignment).
3464 unsigned
3465 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3466 SelectionDAG& DAG) const {
3467 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3468 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3469 unsigned StackAlignment = TFI.getStackAlignment();
3470 uint64_t AlignMask = StackAlignment - 1;
3471 int64_t Offset = StackSize;
3472 unsigned SlotSize = RegInfo->getSlotSize();
3473 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3474 // The misalignment is small enough; just add the difference.
3475 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3477 // Mask out the lower bits, then add one full stack alignment plus (StackAlignment - SlotSize).
3478 Offset = ((~AlignMask) & Offset) + StackAlignment +
3479 (StackAlignment-SlotSize);
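// For example, with StackAlignment = 16 and SlotSize = 8: a StackSize of 20 has
// (20 & 15) = 4 <= 8, so we return 20 + (8 - 4) = 24; a StackSize of 30 has
// (30 & 15) = 14 > 8, so we return (30 & ~15) + 16 + 8 = 40. Either way, the
// arguments plus the 8-byte return address total a multiple of 16.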
3484 /// Return true if the given stack call argument is already available in the
3485 /// same position (relatively) of the caller's incoming argument stack.
3487 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3488 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3489 const X86InstrInfo *TII, const CCValAssign &VA) {
3490 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3493 // Look through nodes that don't alter the bits of the incoming value.
3494 unsigned Op = Arg.getOpcode();
3495 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3496 Arg = Arg.getOperand(0);
3499 if (Op == ISD::TRUNCATE) {
3500 const SDValue &TruncInput = Arg.getOperand(0);
3501 if (TruncInput.getOpcode() == ISD::AssertZext &&
3502 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3503 Arg.getValueType()) {
3504 Arg = TruncInput.getOperand(0);
3512 if (Arg.getOpcode() == ISD::CopyFromReg) {
3513 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3514 if (!TargetRegisterInfo::isVirtualRegister(VR))
3516 MachineInstr *Def = MRI->getVRegDef(VR);
3519 if (!Flags.isByVal()) {
3520 if (!TII->isLoadFromStackSlot(*Def, FI))
3523 unsigned Opcode = Def->getOpcode();
3524 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3525 Opcode == X86::LEA64_32r) &&
3526 Def->getOperand(1).isFI()) {
3527 FI = Def->getOperand(1).getIndex();
3528 Bytes = Flags.getByValSize();
3532 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3533 if (Flags.isByVal())
3534 // ByVal argument is passed in as a pointer but it's now being
3535 // dereferenced. e.g.
3536 // define @foo(%struct.X* %A) {
3537 // tail call @bar(%struct.X* byval %A)
3540 SDValue Ptr = Ld->getBasePtr();
3541 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3544 FI = FINode->getIndex();
3545 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3546 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3547 FI = FINode->getIndex();
3548 Bytes = Flags.getByValSize();
3552 assert(FI != INT_MAX);
3553 if (!MFI->isFixedObjectIndex(FI))
3556 if (Offset != MFI->getObjectOffset(FI))
3559 if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) {
3560 // If the argument location is wider than the argument type, check that any
3561 // extension flags match.
3562 if (Flags.isZExt() != MFI->isObjectZExt(FI) ||
3563 Flags.isSExt() != MFI->isObjectSExt(FI)) {
3568 return Bytes == MFI->getObjectSize(FI);
3571 /// Check whether the call is eligible for tail call optimization. Targets
3572 /// that want to do tail call optimization should implement this function.
3573 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3574 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3575 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3576 const SmallVectorImpl<ISD::OutputArg> &Outs,
3577 const SmallVectorImpl<SDValue> &OutVals,
3578 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3579 if (!mayTailCallThisCC(CalleeCC))
3582 // If -tailcallopt is specified, make fastcc functions tail-callable.
3583 MachineFunction &MF = DAG.getMachineFunction();
3584 const Function *CallerF = MF.getFunction();
3586 // If the function return type is x86_fp80 and the callee return type is not,
3587 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3588 // perform a tailcall optimization here.
3589 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3592 CallingConv::ID CallerCC = CallerF->getCallingConv();
3593 bool CCMatch = CallerCC == CalleeCC;
3594 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
3595 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
3597 // Win64 functions have extra shadow space for argument homing. Don't do the
3598 // sibcall if the caller and callee have mismatched expectations for this
3600 if (IsCalleeWin64 != IsCallerWin64)
3603 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3604 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3609 // Look for obvious safe cases to perform tail call optimization that do not
3610 // require ABI changes. This is what gcc calls sibcall.
3612 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3613 // emit a special epilogue.
3614 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3615 if (RegInfo->needsStackRealignment(MF))
3618 // Also avoid sibcall optimization if either caller or callee uses struct
3619 // return semantics.
3620 if (isCalleeStructRet || isCallerStructRet)
3623 // Do not sibcall optimize vararg calls unless all arguments are passed via
3624 // registers.
3625 LLVMContext &C = *DAG.getContext();
3626 if (isVarArg && !Outs.empty()) {
3627 // Optimizing for varargs on Win64 is unlikely to be safe without
3628 // additional testing.
3629 if (IsCalleeWin64 || IsCallerWin64)
3632 SmallVector<CCValAssign, 16> ArgLocs;
3633 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3635 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3636 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3637 if (!ArgLocs[i].isRegLoc())
3641 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3642 // stack. Therefore, if it's not used by the call it is not safe to optimize
3643 // this into a sibcall.
3644 bool Unused = false;
3645 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3652 SmallVector<CCValAssign, 16> RVLocs;
3653 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
3654 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3655 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3656 CCValAssign &VA = RVLocs[i];
3657 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3662 // Check that the call results are passed in the same way.
3663 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3664 RetCC_X86, RetCC_X86))
3666 // The callee has to preserve all registers the caller needs to preserve.
3667 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3668 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3670 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3671 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3675 unsigned StackArgsSize = 0;
3677 // If the callee takes no arguments then go on to check the results of the
3678 // call.
3679 if (!Outs.empty()) {
3680 // Check if stack adjustment is needed. For now, do not do this if any
3681 // argument is passed on the stack.
3682 SmallVector<CCValAssign, 16> ArgLocs;
3683 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3685 // Allocate shadow area for Win64
3686 if (IsCalleeWin64)
3687 CCInfo.AllocateStack(32, 8);
3689 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3690 StackArgsSize = CCInfo.getNextStackOffset();
3692 if (CCInfo.getNextStackOffset()) {
3693 // Check if the arguments are already laid out in the right way as
3694 // the caller's fixed stack objects.
3695 MachineFrameInfo *MFI = MF.getFrameInfo();
3696 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3697 const X86InstrInfo *TII = Subtarget.getInstrInfo();
3698 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3699 CCValAssign &VA = ArgLocs[i];
3700 SDValue Arg = OutVals[i];
3701 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3702 if (VA.getLocInfo() == CCValAssign::Indirect)
3704 if (!VA.isRegLoc()) {
3705 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3712 bool PositionIndependent = isPositionIndependent();
3713 // If the tailcall address may be in a register, then make sure it's
3714 // possible to register allocate for it. In 32-bit, the call address can
3715 // only target EAX, EDX, or ECX since the tail call must be scheduled after
3716 // callee-saved registers are restored. These happen to be the same
3717 // registers used to pass 'inreg' arguments so watch out for those.
3718 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
3719 !isa<ExternalSymbolSDNode>(Callee)) ||
3720 PositionIndependent)) {
3721 unsigned NumInRegs = 0;
3722 // In PIC we need an extra register to formulate the address computation
3723 // for the callee.
3724 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
3726 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3727 CCValAssign &VA = ArgLocs[i];
3730 unsigned Reg = VA.getLocReg();
3733 case X86::EAX: case X86::EDX: case X86::ECX:
3734 if (++NumInRegs == MaxInRegs)
3741 const MachineRegisterInfo &MRI = MF.getRegInfo();
3742 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3746 bool CalleeWillPop =
3747 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
3748 MF.getTarget().Options.GuaranteedTailCallOpt);
3750 if (unsigned BytesToPop =
3751 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
3752 // If we have bytes to pop, the callee must pop them.
3753 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
3754 if (!CalleePopMatches)
3756 } else if (CalleeWillPop && StackArgsSize > 0) {
3757 // If we don't have bytes to pop, make sure the callee doesn't pop any.
3765 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3766 const TargetLibraryInfo *libInfo) const {
3767 return X86::createFastISel(funcInfo, libInfo);
3770 //===----------------------------------------------------------------------===//
3771 // Other Lowering Hooks
3772 //===----------------------------------------------------------------------===//
3774 static bool MayFoldLoad(SDValue Op) {
3775 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3778 static bool MayFoldIntoStore(SDValue Op) {
3779 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3782 static bool isTargetShuffle(unsigned Opcode) {
3784 default: return false;
3785 case X86ISD::BLENDI:
3786 case X86ISD::PSHUFB:
3787 case X86ISD::PSHUFD:
3788 case X86ISD::PSHUFHW:
3789 case X86ISD::PSHUFLW:
3791 case X86ISD::INSERTPS:
3792 case X86ISD::PALIGNR:
3793 case X86ISD::VSHLDQ:
3794 case X86ISD::VSRLDQ:
3795 case X86ISD::MOVLHPS:
3796 case X86ISD::MOVLHPD:
3797 case X86ISD::MOVHLPS:
3798 case X86ISD::MOVLPS:
3799 case X86ISD::MOVLPD:
3800 case X86ISD::MOVSHDUP:
3801 case X86ISD::MOVSLDUP:
3802 case X86ISD::MOVDDUP:
3805 case X86ISD::UNPCKL:
3806 case X86ISD::UNPCKH:
3807 case X86ISD::VBROADCAST:
3808 case X86ISD::VPERMILPI:
3809 case X86ISD::VPERMILPV:
3810 case X86ISD::VPERM2X128:
3811 case X86ISD::VPERMIL2:
3812 case X86ISD::VPERMI:
3813 case X86ISD::VPPERM:
3814 case X86ISD::VPERMV:
3815 case X86ISD::VPERMV3:
3816 case X86ISD::VZEXT_MOVL:
3821 static bool isTargetShuffleVariableMask(unsigned Opcode) {
3823 default: return false;
3824 case X86ISD::PSHUFB:
3825 case X86ISD::VPERMILPV:
3830 static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
3831 SDValue V1, unsigned TargetMask,
3832 SelectionDAG &DAG) {
3834 default: llvm_unreachable("Unknown x86 shuffle node");
3835 case X86ISD::PSHUFD:
3836 case X86ISD::PSHUFHW:
3837 case X86ISD::PSHUFLW:
3838 case X86ISD::VPERMILPI:
3839 case X86ISD::VPERMI:
3840 return DAG.getNode(Opc, dl, VT, V1,
3841 DAG.getConstant(TargetMask, dl, MVT::i8));
3845 static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
3846 SDValue V1, SDValue V2, SelectionDAG &DAG) {
3848 default: llvm_unreachable("Unknown x86 shuffle node");
3849 case X86ISD::MOVLHPS:
3850 case X86ISD::MOVLHPD:
3851 case X86ISD::MOVHLPS:
3852 case X86ISD::MOVLPS:
3853 case X86ISD::MOVLPD:
3856 case X86ISD::UNPCKL:
3857 case X86ISD::UNPCKH:
3858 return DAG.getNode(Opc, dl, VT, V1, V2);
3862 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3863 MachineFunction &MF = DAG.getMachineFunction();
3864 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3865 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3866 int ReturnAddrIndex = FuncInfo->getRAIndex();
3868 if (ReturnAddrIndex == 0) {
3869 // Set up a frame object for the return address.
3870 unsigned SlotSize = RegInfo->getSlotSize();
3871 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3874 FuncInfo->setRAIndex(ReturnAddrIndex);
3877 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
3880 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3881 bool hasSymbolicDisplacement) {
3882 // Offset should fit into 32 bit immediate field.
3883 if (!isInt<32>(Offset))
3886 // If we don't have a symbolic displacement - we don't have any extra
3887 // restrictions.
3888 if (!hasSymbolicDisplacement)
3891 // FIXME: Some tweaks might be needed for medium code model.
3892 if (M != CodeModel::Small && M != CodeModel::Kernel)
3895 // For the small code model we assume that the latest object lies within 16MB
3896 // of the end of the 31-bit boundary. We may also accept fairly large negative
3897 // constants, knowing that all objects are in the positive half of the address space.
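// For example, with a symbolic displacement the small code model accepts an
// offset of 8 MiB but rejects 32 MiB, while the kernel code model check below
// accepts 4096 and rejects -8.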
3898 if (M == CodeModel::Small && Offset < 16*1024*1024)
3901 // For the kernel code model we know that all objects reside in the negative
3902 // half of the 32-bit address space. We may not accept negative offsets, since
3903 // they may point just below the object, but we may accept fairly large positive ones.
3904 if (M == CodeModel::Kernel && Offset >= 0)
3910 /// Determines whether the callee is required to pop its own arguments.
3911 /// Callee pop is necessary to support tail calls.
3912 bool X86::isCalleePop(CallingConv::ID CallingConv,
3913 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
3914 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
3915 // can guarantee TCO.
3916 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
3919 switch (CallingConv) {
3922 case CallingConv::X86_StdCall:
3923 case CallingConv::X86_FastCall:
3924 case CallingConv::X86_ThisCall:
3925 case CallingConv::X86_VectorCall:
3930 /// \brief Return true if the condition is an unsigned comparison operation.
3931 static bool isX86CCUnsigned(unsigned X86CC) {
3934 llvm_unreachable("Invalid integer condition!");
3950 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
3951 switch (SetCCOpcode) {
3952 default: llvm_unreachable("Invalid integer condition!");
3953 case ISD::SETEQ: return X86::COND_E;
3954 case ISD::SETGT: return X86::COND_G;
3955 case ISD::SETGE: return X86::COND_GE;
3956 case ISD::SETLT: return X86::COND_L;
3957 case ISD::SETLE: return X86::COND_LE;
3958 case ISD::SETNE: return X86::COND_NE;
3959 case ISD::SETULT: return X86::COND_B;
3960 case ISD::SETUGT: return X86::COND_A;
3961 case ISD::SETULE: return X86::COND_BE;
3962 case ISD::SETUGE: return X86::COND_AE;
3966 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
3967 /// condition code, returning the condition code and the LHS/RHS of the
3968 /// comparison to make.
3969 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
3970 bool isFP, SDValue &LHS, SDValue &RHS,
3971 SelectionDAG &DAG) {
3973 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3974 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3975 // X > -1 -> X == 0, jump !sign.
3976 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3977 return X86::COND_NS;
3979 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3980 // X < 0 -> X == 0, jump on sign.
3983 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3985 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3986 return X86::COND_LE;
3990 return TranslateIntegerX86CC(SetCCOpcode);
3993 // First determine if it is required or is profitable to flip the operands.
3995 // If LHS is a foldable load, but RHS is not, flip the condition.
3996 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3997 !ISD::isNON_EXTLoad(RHS.getNode())) {
3998 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3999 std::swap(LHS, RHS);
4002 switch (SetCCOpcode) {
4008 std::swap(LHS, RHS);
4012 // On a floating point condition, the flags are set as follows:
4013 //  ZF  PF  CF   op
4014 //   0 | 0 | 0 | X > Y
4015 //   0 | 0 | 1 | X < Y
4016 //   1 | 0 | 0 | X == Y
4017 //   1 | 1 | 1 | unordered
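// (These are the EFLAGS produced by the (u)comiss/(u)comisd and fcomi-family
// compare instructions.)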
4018 switch (SetCCOpcode) {
4019 default: llvm_unreachable("Condcode should be pre-legalized away");
4021 case ISD::SETEQ: return X86::COND_E;
4022 case ISD::SETOLT: // flipped
4024 case ISD::SETGT: return X86::COND_A;
4025 case ISD::SETOLE: // flipped
4027 case ISD::SETGE: return X86::COND_AE;
4028 case ISD::SETUGT: // flipped
4030 case ISD::SETLT: return X86::COND_B;
4031 case ISD::SETUGE: // flipped
4033 case ISD::SETLE: return X86::COND_BE;
4035 case ISD::SETNE: return X86::COND_NE;
4036 case ISD::SETUO: return X86::COND_P;
4037 case ISD::SETO: return X86::COND_NP;
4039 case ISD::SETUNE: return X86::COND_INVALID;
4043 /// Is there a floating point cmov for the specific X86 condition code?
4044 /// Current x86 isa includes the following FP cmov instructions:
4045 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4046 static bool hasFPCMov(unsigned X86CC) {
4063 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4065 unsigned Intrinsic) const {
4067 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4071 Info.opc = ISD::INTRINSIC_W_CHAIN;
4072 Info.readMem = false;
4073 Info.writeMem = false;
4077 switch (IntrData->Type) {
4078 case EXPAND_FROM_MEM: {
4079 Info.ptrVal = I.getArgOperand(0);
4080 Info.memVT = MVT::getVT(I.getType());
4082 Info.readMem = true;
4085 case COMPRESS_TO_MEM: {
4086 Info.ptrVal = I.getArgOperand(0);
4087 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4089 Info.writeMem = true;
4092 case TRUNCATE_TO_MEM_VI8:
4093 case TRUNCATE_TO_MEM_VI16:
4094 case TRUNCATE_TO_MEM_VI32: {
4095 Info.ptrVal = I.getArgOperand(0);
4096 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4097 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4098 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4100 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4101 ScalarVT = MVT::i16;
4102 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4103 ScalarVT = MVT::i32;
4105 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4107 Info.writeMem = true;
4117 /// Returns true if the target can instruction select the
4118 /// specified FP immediate natively. If false, the legalizer will
4119 /// materialize the FP immediate as a load from a constant pool.
4120 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4121 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4122 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4128 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4129 ISD::LoadExtType ExtTy,
4131 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4132 // relocations must target a movq or addq instruction: don't let the load shrink.
4133 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4134 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4135 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4136 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4140 /// \brief Returns true if it is beneficial to convert a load of a constant
4141 /// to just the constant itself.
4142 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4144 assert(Ty->isIntegerTy());
4146 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4147 if (BitSize == 0 || BitSize > 64)
4152 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4153 unsigned Index) const {
4154 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4157 return (Index == 0 || Index == ResVT.getVectorNumElements());
4160 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4161 // Speculate cttz only if we can directly use TZCNT.
4162 return Subtarget.hasBMI();
4165 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4166 // Speculate ctlz only if we can directly use LZCNT.
4167 return Subtarget.hasLZCNT();
4170 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4171 if (!Subtarget.hasBMI())
4174 // There are only 32-bit and 64-bit forms for 'andn'.
4175 EVT VT = Y.getValueType();
4176 if (VT != MVT::i32 && VT != MVT::i64)
4182 /// Return true if every element in Mask, beginning
4183 /// from position Pos and ending in Pos+Size is undef.
4184 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4185 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4191 /// Return true if Val is undef or if its value falls within the
4192 /// specified range [Low, Hi).
4193 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4194 return (Val < 0) || (Val >= Low && Val < Hi);
4197 /// Return true if every element in Mask is undef or if its value
4198 /// falls within the specified range [Low, Hi).
4199 static bool isUndefOrInRange(ArrayRef<int> Mask,
4202 if (!isUndefOrInRange(M, Low, Hi))
4207 /// Val is either less than zero (undef) or equal to the specified value.
4208 static bool isUndefOrEqual(int Val, int CmpVal) {
4209 return (Val < 0 || Val == CmpVal);
4212 /// Val is either the undef or zero sentinel value.
4213 static bool isUndefOrZero(int Val) {
4214 return (Val == SM_SentinelUndef || Val == SM_SentinelZero);
4217 /// Return true if every element in Mask, beginning
4218 /// from position Pos and ending in Pos+Size, falls within the specified
4219 /// sequential range [Low, Low+Size), or is undef.
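/// For example, <4, -1, 6, 7> is sequential-or-undef for Pos = 0, Size = 4 and
/// Low = 4.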
4220 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4221 unsigned Pos, unsigned Size, int Low) {
4222 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4223 if (!isUndefOrEqual(Mask[i], Low))
4228 /// Return true if every element in Mask, beginning
4229 /// from position Pos and ending in Pos+Size, falls within the specified
4230 /// sequential range [Low, Low+Size), or is undef or zero.
4231 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4232 unsigned Size, int Low) {
4233 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4234 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4239 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4240 /// extract that is suitable for instructions that extract 128- or 256-bit vectors.
4241 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4242 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4243 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4246 // The index should be aligned on a vecWidth-bit boundary.
4248 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4250 MVT VT = N->getSimpleValueType(0);
4251 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4252 bool Result = (Index * ElSize) % vecWidth == 0;
4257 /// Return true if the specified INSERT_SUBVECTOR
4258 /// operand specifies a subvector insert that is suitable for input to
4259 /// insertion of 128- or 256-bit subvectors.
4260 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4261 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4262 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4264 // The index should be aligned on a vecWidth-bit boundary.
4266 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4268 MVT VT = N->getSimpleValueType(0);
4269 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4270 bool Result = (Index * ElSize) % vecWidth == 0;
4275 bool X86::isVINSERT128Index(SDNode *N) {
4276 return isVINSERTIndex(N, 128);
4279 bool X86::isVINSERT256Index(SDNode *N) {
4280 return isVINSERTIndex(N, 256);
4283 bool X86::isVEXTRACT128Index(SDNode *N) {
4284 return isVEXTRACTIndex(N, 128);
4287 bool X86::isVEXTRACT256Index(SDNode *N) {
4288 return isVEXTRACTIndex(N, 256);
4291 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4292 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4293 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4294 "Illegal extract subvector for VEXTRACT");
4297 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4299 MVT VecVT = N->getOperand(0).getSimpleValueType();
4300 MVT ElVT = VecVT.getVectorElementType();
4302 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4303 return Index / NumElemsPerChunk;
4306 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4307 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4308 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4309 "Illegal insert subvector for VINSERT");
4312 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4314 MVT VecVT = N->getSimpleValueType(0);
4315 MVT ElVT = VecVT.getVectorElementType();
4317 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4318 return Index / NumElemsPerChunk;
4321 /// Return the appropriate immediate to extract the specified
4322 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
4323 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4324 return getExtractVEXTRACTImmediate(N, 128);
4327 /// Return the appropriate immediate to extract the specified
4328 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4329 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4330 return getExtractVEXTRACTImmediate(N, 256);
4333 /// Return the appropriate immediate to insert at the specified
4334 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4335 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4336 return getInsertVINSERTImmediate(N, 128);
4339 /// Return the appropriate immediate to insert at the specified
4340 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4341 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4342 return getInsertVINSERTImmediate(N, 256);
4345 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4346 bool X86::isZeroNode(SDValue Elt) {
4347 return isNullConstant(Elt) || isNullFPConstant(Elt);
4350 // Build a vector of constants.
4351 // Use an UNDEF node if MaskElt == -1.
4352 // Split 64-bit constants in 32-bit mode.
4353 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4354 const SDLoc &dl, bool IsMask = false) {
4356 SmallVector<SDValue, 32> Ops;
4359 MVT ConstVecVT = VT;
4360 unsigned NumElts = VT.getVectorNumElements();
4361 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4362 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4363 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4367 MVT EltVT = ConstVecVT.getVectorElementType();
4368 for (unsigned i = 0; i < NumElts; ++i) {
4369 bool IsUndef = Values[i] < 0 && IsMask;
4370 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4371 DAG.getConstant(Values[i], dl, EltVT);
4372 Ops.push_back(OpNode);
4374 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4375 DAG.getConstant(0, dl, EltVT));
4377 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4379 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4383 /// Returns a vector of specified type with all zero elements.
4384 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4385 SelectionDAG &DAG, const SDLoc &dl) {
4386 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4387 VT.getVectorElementType() == MVT::i1) &&
4388 "Unexpected vector type");
4390 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4391 // type. This ensures they get CSE'd. But if the integer type is not
4392 // available, use a floating-point +0.0 instead.
4394 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4395 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4396 } else if (VT.getVectorElementType() == MVT::i1) {
4397 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4398 "Unexpected vector type");
4399 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4400 "Unexpected vector type");
4401 Vec = DAG.getConstant(0, dl, VT);
4403 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4404 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4406 return DAG.getBitcast(VT, Vec);
4409 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4410 const SDLoc &dl, unsigned vectorWidth) {
4411 assert((vectorWidth == 128 || vectorWidth == 256) &&
4412 "Unsupported vector width");
4413 EVT VT = Vec.getValueType();
4414 EVT ElVT = VT.getVectorElementType();
4415 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4416 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4417 VT.getVectorNumElements()/Factor);
4419 // Extract from UNDEF is UNDEF.
4421 return DAG.getUNDEF(ResultVT);
4423 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4424 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4425 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4427 // This is the index of the first element of the vectorWidth-bit chunk
4428 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4429 IdxVal &= ~(ElemsPerChunk - 1);
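// For example, when extracting 128 bits from a v8f32, ElemsPerChunk is 4, so an
// IdxVal of 5 is rounded down to 4 (the start of the upper 128-bit half).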
4431 // If the input is a buildvector just emit a smaller one.
4432 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4433 return DAG.getNode(ISD::BUILD_VECTOR,
4434 dl, ResultVT, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
4436 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4437 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4440 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4441 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4442 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4443 /// instructions or a simple subregister reference. Idx is an index in the
4444 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4445 /// lowering EXTRACT_VECTOR_ELT operations easier.
4446 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4447 SelectionDAG &DAG, const SDLoc &dl) {
4448 assert((Vec.getValueType().is256BitVector() ||
4449 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4450 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4451 }
4453 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4454 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4455 SelectionDAG &DAG, const SDLoc &dl) {
4456 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4457 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4458 }
4460 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4461 SelectionDAG &DAG, const SDLoc &dl,
4462 unsigned vectorWidth) {
4463 assert((vectorWidth == 128 || vectorWidth == 256) &&
4464 "Unsupported vector width");
4465 // Inserting an UNDEF subvector leaves Result unchanged.
4466 if (Vec.isUndef())
4467 return Result;
4468 EVT VT = Vec.getValueType();
4469 EVT ElVT = VT.getVectorElementType();
4470 EVT ResultVT = Result.getValueType();
4472 // Insert the relevant vectorWidth bits.
4473 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4474 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4476 // This is the index of the first element of the vectorWidth-bit chunk
4477 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
4478 IdxVal &= ~(ElemsPerChunk - 1);
4480 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4481 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4482 }
4484 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
4485 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4486 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4487 /// simple superregister reference. Idx is an index in the 128 bits
4488 /// we want. It need not be aligned to a 128-bit boundary. That makes
4489 /// lowering INSERT_VECTOR_ELT operations easier.
4490 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4491 SelectionDAG &DAG, const SDLoc &dl) {
4492 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4494 // For insertion into the zero index (low half) of a 256-bit vector, it is
4495 // more efficient to generate a blend with immediate instead of an insert*128.
4496 // We are still creating an INSERT_SUBVECTOR below with an undef node to
4497 // extend the subvector to the size of the result vector. Make sure that
4498 // we are not recursing on that node by checking for undef here.
4499 if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
4500 !Result.isUndef()) {
4501 EVT ResultVT = Result.getValueType();
4502 SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
4503 SDValue Undef = DAG.getUNDEF(ResultVT);
4504 SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
4505 Vec, ZeroIndex);
4507 // The blend instruction, and therefore its mask, depend on the data type.
4508 MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
4509 if (ScalarType.isFloatingPoint()) {
4510 // Choose either vblendps (float) or vblendpd (double).
4511 unsigned ScalarSize = ScalarType.getSizeInBits();
4512 assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
4513 unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
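// With Result as the first blend operand and Vec256 (the widened subvector)
// as the second, set mask bits select lanes from Vec256; taking the low half
// therefore means setting the low 2 bits (v4f64) or the low 4 bits (v8f32).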
4514 SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
4515 return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
4516 }
4518 const X86Subtarget &Subtarget =
4519 static_cast<const X86Subtarget &>(DAG.getSubtarget());
4521 // AVX2 is needed for 256-bit integer blend support.
4522 // Integers must be cast to 32-bit because there is only vpblendd;
4523 // vpblendw can't be used for this because it has a handicapped mask.
4525 // If we don't have AVX2, then cast to float. Using a wrong domain blend
4526 // is still more efficient than using the wrong domain vinsertf128 that
4527 // will be created by InsertSubVector().
4528 MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
4530 SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
4531 Result = DAG.getBitcast(CastVT, Result);
4532 Vec256 = DAG.getBitcast(CastVT, Vec256);
4533 Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
4534 return DAG.getBitcast(ResultVT, Vec256);
4535 }
4537 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4538 }
4540 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4541 SelectionDAG &DAG, const SDLoc &dl) {
4542 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4543 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
4544 }
4546 /// Insert i1-subvector to i1-vector.
4547 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4548 const X86Subtarget &Subtarget) {
4550 SDLoc dl(Op);
4551 SDValue Vec = Op.getOperand(0);
4552 SDValue SubVec = Op.getOperand(1);
4553 SDValue Idx = Op.getOperand(2);
4555 if (!isa<ConstantSDNode>(Idx))
4556 return SDValue();
4558 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
4559 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4560 return Op;
4562 MVT OpVT = Op.getSimpleValueType();
4563 MVT SubVecVT = SubVec.getSimpleValueType();
4564 unsigned NumElems = OpVT.getVectorNumElements();
4565 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4567 assert(IdxVal + SubVecNumElems <= NumElems &&
4568 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4569 "Unexpected index value in INSERT_SUBVECTOR");
4571 // There are 3 possible cases:
4572 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
4573 // 2. Subvector should be inserted in the upper part
4574 // (IdxVal + SubVecNumElems == NumElems)
4575 // 3. Subvector should be inserted in the middle (for example v2i1
4576 // to v16i1, index 2)
4578 // extend to natively supported kshift
4579 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
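// KSHIFTW (AVX-512F) operates on v16i1 mask registers; the narrower v8i1
// form requires DQI, which is why MinVT depends on Subtarget.hasDQI().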
4580 MVT WideOpVT = OpVT;
4581 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
4582 WideOpVT = MinVT;
4584 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4585 SDValue Undef = DAG.getUNDEF(WideOpVT);
4586 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4587 Undef, SubVec, ZeroIdx);
4589 // Extract the sub-vector if required.
4590 auto ExtractSubVec = [&](SDValue V) {
4591 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
4592 OpVT, V, ZeroIdx);
4593 };
4595 if (Vec.isUndef()) {
4596 if (IdxVal != 0) {
4597 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
4598 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
4599 }
4600 return ExtractSubVec(WideSubVec);
4601 }
4603 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4604 NumElems = WideOpVT.getVectorNumElements();
4605 unsigned ShiftLeft = NumElems - SubVecNumElems;
4606 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4607 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
4608 DAG.getConstant(ShiftLeft, dl, MVT::i8));
4609 Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
4610 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
4611 return ExtractSubVec(Vec);
4612 }
4614 if (IdxVal == 0) {
4615 // Zero lower bits of the Vec
4616 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
4617 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4618 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
4619 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
4620 // Merge them together, SubVec should be zero extended.
4621 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4622 getZeroVector(WideOpVT, Subtarget, DAG, dl),
4623 SubVec, ZeroIdx);
4624 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
4625 return ExtractSubVec(Vec);
4626 }
4628 // Simple case when we put subvector in the upper part
4629 if (IdxVal + SubVecNumElems == NumElems) {
4630 // Zero upper bits of the Vec
4631 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
4632 DAG.getConstant(IdxVal, dl, MVT::i8));
4633 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
4634 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4635 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
4636 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
4637 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
4638 return ExtractSubVec(Vec);
4639 }
4640 // Subvector should be inserted in the middle - use shuffle
4641 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
4642 SubVec, ZeroIdx);
4643 SmallVector<int, 64> Mask;
4644 for (unsigned i = 0; i < NumElems; ++i)
4645 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
4646 i : i + NumElems);
4647 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
4648 }
4650 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
4651 /// instructions. This is used because creating CONCAT_VECTOR nodes of
4652 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
4653 /// large BUILD_VECTORS.
4654 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
4655 unsigned NumElems, SelectionDAG &DAG,
4657 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4658 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
4659 }
4661 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
4662 unsigned NumElems, SelectionDAG &DAG,
4664 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4665 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
4666 }
4668 /// Returns a vector of specified type with all bits set.
4669 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4670 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
4671 /// Then bitcast to their original type, ensuring they get CSE'd.
4672 static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
4673 SelectionDAG &DAG, const SDLoc &dl) {
4674 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4675 "Expected a 128/256/512-bit vector type");
4677 APInt Ones = APInt::getAllOnesValue(32);
4678 unsigned NumElts = VT.getSizeInBits() / 32;
4679 SDValue Vec;
4680 if (!Subtarget.hasInt256() && NumElts == 8) {
4681 Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
4682 Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4683 } else {
4684 Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
4685 }
4686 return DAG.getBitcast(VT, Vec);
4687 }
4689 /// Returns a vector_shuffle node for an unpackl operation.
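/// For example, with VT == v4i32 this produces the mask <0, 4, 1, 5>,
/// i.e. the interleaved low halves of V1 and V2 (punpckldq).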
4690 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
4691 SDValue V1, SDValue V2) {
4692 assert(VT.is128BitVector() && "Expected a 128-bit vector type");
4693 unsigned NumElems = VT.getVectorNumElements();
4694 SmallVector<int, 8> Mask(NumElems);
4695 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4696 Mask[i * 2] = i;
4697 Mask[i * 2 + 1] = i + NumElems;
4698 }
4699 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4700 }
4702 /// Returns a vector_shuffle node for an unpackh operation.
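/// For example, with VT == v4i32 this produces the mask <2, 6, 3, 7>,
/// i.e. the interleaved high halves of V1 and V2 (punpckhdq).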
4703 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
4704 SDValue V1, SDValue V2) {
4705 assert(VT.is128BitVector() && "Expected a 128-bit vector type");
4706 unsigned NumElems = VT.getVectorNumElements();
4707 SmallVector<int, 8> Mask(NumElems);
4708 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4709 Mask[i * 2] = i + Half;
4710 Mask[i * 2 + 1] = i + NumElems + Half;
4711 }
4712 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4713 }
4715 /// Return a vector_shuffle of the specified vector of zero or undef vector.
4716 /// This produces a shuffle where the low element of V2 is swizzled into the
4717 /// zero/undef vector, landing at element Idx.
4718 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4719 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4720 bool IsZero,
4721 const X86Subtarget &Subtarget,
4722 SelectionDAG &DAG) {
4723 MVT VT = V2.getSimpleValueType();
4724 SDValue V1 = IsZero
4725 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4726 int NumElems = VT.getVectorNumElements();
4727 SmallVector<int, 16> MaskVec(NumElems);
4728 for (int i = 0; i != NumElems; ++i)
4729 // If this is the insertion idx, put the low elt of V2 here.
4730 MaskVec[i] = (i == Idx) ? NumElems : i;
4731 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4732 }
4734 static SDValue peekThroughBitcasts(SDValue V) {
4735 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
4736 V = V.getOperand(0);
4737 return V;
4738 }
4740 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
4741 unsigned MaskEltSizeInBits,
4742 SmallVectorImpl<uint64_t> &RawMask) {
4743 MaskNode = peekThroughBitcasts(MaskNode);
4745 MVT VT = MaskNode.getSimpleValueType();
4746 assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
4748 // Split an APInt element into MaskEltSizeInBits sized pieces and
4749 // insert into the shuffle mask.
4750 auto SplitElementToMask = [&](APInt Element) {
4751 // Note that this is x86 and so always little endian: the low byte is
4752 // the first byte of the mask.
4753 int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
4754 for (int i = 0; i < Split; ++i) {
4755 APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
4756 Element = Element.lshr(MaskEltSizeInBits);
4757 RawMask.push_back(RawElt.getZExtValue());
4761 if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
4762 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4763 // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
4764 if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
4765 return false;
4766 if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
4767 const APInt &MaskElement = CN->getAPIntValue();
4768 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
4769 APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
4770 RawMask.push_back(RawElt.getZExtValue());
4776 if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
4777 MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
4779 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4780 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
4781 return false;
4782 unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
4784 SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
4785 if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
4786 SplitElementToMask(CN->getAPIntValue());
4787 RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
4793 if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
4796 // We can always decode if the buildvector is all zero constants,
4797 // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
4798 if (llvm::all_of(MaskNode->ops(), X86::isZeroNode)) {
4799 RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0);
4800 return true;
4801 }
4803 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4804 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
4805 return false;
4807 for (SDValue Op : MaskNode->ops()) {
4808 if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
4809 SplitElementToMask(CN->getAPIntValue());
4810 else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
4811 SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
4819 static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) {
4820 MaskNode = peekThroughBitcasts(MaskNode);
4822 auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
4823 if (!MaskLoad)
4824 return nullptr;
4826 SDValue Ptr = MaskLoad->getBasePtr();
4827 if (Ptr->getOpcode() == X86ISD::Wrapper ||
4828 Ptr->getOpcode() == X86ISD::WrapperRIP)
4829 Ptr = Ptr->getOperand(0);
4831 auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
4832 if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
4833 return nullptr;
4835 return dyn_cast<Constant>(MaskCP->getConstVal());
4836 }
4838 /// Calculates the shuffle mask corresponding to the target-specific opcode.
4839 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
4840 /// operands in \p Ops, and returns true.
4841 /// Sets \p IsUnary to true if only one source is used. Note that this will set
4842 /// IsUnary for shuffles which use a single input multiple times, and in those
4843 /// cases it will adjust the mask to only have indices within that single input.
4844 /// It is an error to call this with non-empty Mask/Ops vectors.
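/// A minimal caller-side sketch (mirroring how the helpers below use it):
/// \code
///   SmallVector<SDValue, 2> Ops;
///   SmallVector<int, 16> ShuffleMask;
///   bool IsUnary;
///   if (getTargetShuffleMask(N, VT, /*AllowSentinelZero*/true, Ops,
///                            ShuffleMask, IsUnary)) {
///     // ShuffleMask now holds element indices plus SM_SentinelUndef/Zero.
///   }
/// \endcode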
4845 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
4846 SmallVectorImpl<SDValue> &Ops,
4847 SmallVectorImpl<int> &Mask, bool &IsUnary) {
4848 unsigned NumElems = VT.getVectorNumElements();
4849 SDValue ImmN;
4851 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
4852 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
4854 IsUnary = false;
4855 bool IsFakeUnary = false;
4856 switch(N->getOpcode()) {
4857 case X86ISD::BLENDI:
4858 ImmN = N->getOperand(N->getNumOperands()-1);
4859 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4860 break;
4861 case X86ISD::SHUFP:
4862 ImmN = N->getOperand(N->getNumOperands()-1);
4863 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4864 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4866 case X86ISD::INSERTPS:
4867 ImmN = N->getOperand(N->getNumOperands()-1);
4868 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4869 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4871 case X86ISD::UNPCKH:
4872 DecodeUNPCKHMask(VT, Mask);
4873 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4875 case X86ISD::UNPCKL:
4876 DecodeUNPCKLMask(VT, Mask);
4877 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4879 case X86ISD::MOVHLPS:
4880 DecodeMOVHLPSMask(NumElems, Mask);
4881 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4883 case X86ISD::MOVLHPS:
4884 DecodeMOVLHPSMask(NumElems, Mask);
4885 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4887 case X86ISD::PALIGNR:
4888 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4889 ImmN = N->getOperand(N->getNumOperands()-1);
4890 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4892 case X86ISD::VSHLDQ:
4893 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4894 ImmN = N->getOperand(N->getNumOperands() - 1);
4895 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4898 case X86ISD::VSRLDQ:
4899 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4900 ImmN = N->getOperand(N->getNumOperands() - 1);
4901 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4904 case X86ISD::PSHUFD:
4905 case X86ISD::VPERMILPI:
4906 ImmN = N->getOperand(N->getNumOperands()-1);
4907 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4910 case X86ISD::PSHUFHW:
4911 ImmN = N->getOperand(N->getNumOperands()-1);
4912 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4915 case X86ISD::PSHUFLW:
4916 ImmN = N->getOperand(N->getNumOperands()-1);
4917 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4920 case X86ISD::VZEXT_MOVL:
4921 DecodeZeroMoveLowMask(VT, Mask);
4924 case X86ISD::VBROADCAST: {
4925 // We only decode broadcasts of same-sized vectors at the moment.
4926 if (N->getOperand(0).getValueType() == VT) {
4927 DecodeVectorBroadcast(VT, Mask);
4933 case X86ISD::VPERMILPV: {
4935 SDValue MaskNode = N->getOperand(1);
4936 unsigned MaskEltSize = VT.getScalarSizeInBits();
4937 SmallVector<uint64_t, 32> RawMask;
4938 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
4939 DecodeVPERMILPMask(VT, RawMask, Mask);
4942 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
4943 DecodeVPERMILPMask(C, MaskEltSize, Mask);
4948 case X86ISD::PSHUFB: {
4950 SDValue MaskNode = N->getOperand(1);
4951 SmallVector<uint64_t, 32> RawMask;
4952 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
4953 DecodePSHUFBMask(RawMask, Mask);
4956 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
4957 DecodePSHUFBMask(C, Mask);
4962 case X86ISD::VPERMI:
4963 ImmN = N->getOperand(N->getNumOperands()-1);
4964 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4969 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
4971 case X86ISD::VPERM2X128:
4972 ImmN = N->getOperand(N->getNumOperands()-1);
4973 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4974 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4976 case X86ISD::MOVSLDUP:
4977 DecodeMOVSLDUPMask(VT, Mask);
4980 case X86ISD::MOVSHDUP:
4981 DecodeMOVSHDUPMask(VT, Mask);
4984 case X86ISD::MOVDDUP:
4985 DecodeMOVDDUPMask(VT, Mask);
4988 case X86ISD::MOVLHPD:
4989 case X86ISD::MOVLPD:
4990 case X86ISD::MOVLPS:
4991 // Not yet implemented
4992 return false;
4993 case X86ISD::VPERMIL2: {
4994 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4995 unsigned MaskEltSize = VT.getScalarSizeInBits();
4996 SDValue MaskNode = N->getOperand(2);
4997 SDValue CtrlNode = N->getOperand(3);
4998 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
4999 unsigned CtrlImm = CtrlOp->getZExtValue();
5000 SmallVector<uint64_t, 32> RawMask;
5001 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5002 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5005 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5006 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5012 case X86ISD::VPPERM: {
5013 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5014 SDValue MaskNode = N->getOperand(2);
5015 SmallVector<uint64_t, 32> RawMask;
5016 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5017 DecodeVPPERMMask(RawMask, Mask);
5020 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5021 DecodeVPPERMMask(C, Mask);
5026 case X86ISD::VPERMV: {
5028 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5029 Ops.push_back(N->getOperand(1));
5030 SDValue MaskNode = N->getOperand(0);
5031 SmallVector<uint64_t, 32> RawMask;
5032 unsigned MaskEltSize = VT.getScalarSizeInBits();
5033 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5034 DecodeVPERMVMask(RawMask, Mask);
5037 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5038 DecodeVPERMVMask(C, VT, Mask);
5043 case X86ISD::VPERMV3: {
5044 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5045 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5046 Ops.push_back(N->getOperand(0));
5047 Ops.push_back(N->getOperand(2));
5048 SDValue MaskNode = N->getOperand(1);
5049 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5050 DecodeVPERMV3Mask(C, VT, Mask);
5055 default: llvm_unreachable("unknown target shuffle node");
5056 }
5058 // Empty mask indicates the decode failed.
5059 if (Mask.empty())
5060 return false;
5062 // Check if we're getting a shuffle mask with zero'd elements.
5063 if (!AllowSentinelZero)
5064 if (llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5065 return false;
5067 // If we have a fake unary shuffle, the shuffle mask is spread across two
5068 // inputs that are actually the same node. Re-map the mask to always point
5069 // into the first input.
5070 if (IsFakeUnary)
5071 for (int &M : Mask)
5072 if (M >= (int)Mask.size())
5073 M -= Mask.size();
5075 // If we didn't already add operands in the opcode-specific code, default to
5076 // adding 1 or 2 operands starting at 0.
5077 if (Ops.empty()) {
5078 Ops.push_back(N->getOperand(0));
5079 if (!IsUnary || IsFakeUnary)
5080 Ops.push_back(N->getOperand(1));
5081 }
5083 return true;
5084 }
5086 /// Check a target shuffle mask's inputs to see if we can set any values to
5087 /// SM_SentinelZero - this is for elements that are known to be zero
5088 /// (not just zeroable) from their inputs.
5089 /// Returns true if the target shuffle mask was decoded.
5090 static bool setTargetShuffleZeroElements(SDValue N,
5091 SmallVectorImpl<int> &Mask,
5092 SmallVectorImpl<SDValue> &Ops) {
5093 bool IsUnary;
5094 if (!isTargetShuffle(N.getOpcode()))
5095 return false;
5096 if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops,
5097 Mask, IsUnary))
5098 return false;
5100 SDValue V1 = Ops[0];
5101 SDValue V2 = IsUnary ? V1 : Ops[1];
5103 V1 = peekThroughBitcasts(V1);
5104 V2 = peekThroughBitcasts(V2);
5106 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5107 int M = Mask[i];
5109 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5110 if (M < 0)
5111 continue;
5113 // Determine shuffle input and normalize the mask.
5114 SDValue V = M < Size ? V1 : V2;
5115 M %= Size;
5117 // We are referencing an UNDEF input.
5118 if (V.isUndef()) {
5119 Mask[i] = SM_SentinelUndef;
5120 continue;
5121 }
5123 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5124 if (V.getOpcode() != ISD::BUILD_VECTOR)
5125 continue;
5127 // If the BUILD_VECTOR has fewer elements, then the (larger) source
5128 // element must be UNDEF/ZERO.
5129 // TODO: Is it worth testing the individual bits of a constant?
5130 if ((Size % V.getNumOperands()) == 0) {
5131 int Scale = Size / V->getNumOperands();
5132 SDValue Op = V.getOperand(M / Scale);
5133 if (Op.isUndef())
5134 Mask[i] = SM_SentinelUndef;
5135 else if (X86::isZeroNode(Op))
5136 Mask[i] = SM_SentinelZero;
5137 continue;
5138 }
5140 // If the BUILD_VECTOR has more elements, then all the (smaller) source
5141 // elements must be all UNDEF or all ZERO.
5142 if ((V.getNumOperands() % Size) == 0) {
5143 int Scale = V->getNumOperands() / Size;
5144 bool AllUndef = true;
5145 bool AllZero = true;
5146 for (int j = 0; j < Scale; ++j) {
5147 SDValue Op = V.getOperand((M * Scale) + j);
5148 AllUndef &= Op.isUndef();
5149 AllZero &= X86::isZeroNode(Op);
5150 }
5151 if (AllUndef)
5152 Mask[i] = SM_SentinelUndef;
5153 else if (AllZero)
5154 Mask[i] = SM_SentinelZero;
5155 continue;
5156 }
5157 }
5159 return true;
5160 }
5162 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
5163 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
5164 /// remaining input indices in case we now have a unary shuffle and adjust the
5165 /// Op0/Op1 inputs accordingly.
5166 /// Returns true if the target shuffle mask was decoded.
5167 static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
5168 SmallVectorImpl<int> &Mask) {
5169 SmallVector<SDValue, 2> Ops;
5170 if (!setTargetShuffleZeroElements(Op, Mask, Ops))
5171 return false;
5173 int NumElts = Mask.size();
5174 bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
5175 return 0 <= Idx && Idx < NumElts;
5176 });
5177 bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
5178 [NumElts](int Idx) { return NumElts <= Idx; });
5180 Op0 = Op0InUse ? Ops[0] : SDValue();
5181 Op1 = Op1InUse ? Ops[1] : SDValue();
5183 // We're only using Op1 - commute the mask and inputs.
5184 if (!Op0InUse && Op1InUse) {
5185 for (int &M : Mask)
5186 M -= NumElts;
5187 Op0 = Op1;
5188 Op1 = SDValue();
5189 }
5191 return true;
5192 }
5195 /// Returns the scalar element that will make up the ith
5196 /// element of the result of the vector shuffle.
5197 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5198 unsigned Depth) {
5199 if (Depth == 6)
5200 return SDValue(); // Limit search depth.
5202 SDValue V = SDValue(N, 0);
5203 EVT VT = V.getValueType();
5204 unsigned Opcode = V.getOpcode();
5206 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5207 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5208 int Elt = SV->getMaskElt(Index);
5210 if (Elt < 0)
5211 return DAG.getUNDEF(VT.getVectorElementType());
5213 unsigned NumElems = VT.getVectorNumElements();
5214 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5215 : SV->getOperand(1);
5216 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5217 }
5219 // Recurse into target specific vector shuffles to find scalars.
5220 if (isTargetShuffle(Opcode)) {
5221 MVT ShufVT = V.getSimpleValueType();
5222 MVT ShufSVT = ShufVT.getVectorElementType();
5223 int NumElems = (int)ShufVT.getVectorNumElements();
5224 SmallVector<int, 16> ShuffleMask;
5225 SmallVector<SDValue, 16> ShuffleOps;
5226 bool IsUnary;
5228 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
5229 return SDValue();
5231 int Elt = ShuffleMask[Index];
5232 if (Elt == SM_SentinelZero)
5233 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
5234 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
5235 if (Elt == SM_SentinelUndef)
5236 return DAG.getUNDEF(ShufSVT);
5238 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
5239 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
5240 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5241 Depth+1);
5242 }
5244 // Actual nodes that may contain scalar elements
5245 if (Opcode == ISD::BITCAST) {
5246 V = V.getOperand(0);
5247 EVT SrcVT = V.getValueType();
5248 unsigned NumElems = VT.getVectorNumElements();
5250 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5251 return SDValue();
5254 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5255 return (Index == 0) ? V.getOperand(0)
5256 : DAG.getUNDEF(VT.getVectorElementType());
5258 if (V.getOpcode() == ISD::BUILD_VECTOR)
5259 return V.getOperand(Index);
5261 return SDValue();
5262 }
5264 /// Custom lower build_vector of v16i8.
5265 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5266 unsigned NumNonZero, unsigned NumZero,
5268 const X86Subtarget &Subtarget,
5269 const TargetLowering &TLI) {
5277 // SSE4.1 - use PINSRB to insert each byte directly.
5278 if (Subtarget.hasSSE41()) {
5279 for (unsigned i = 0; i < 16; ++i) {
5280 bool isNonZero = (NonZeros & (1 << i)) != 0;
5284 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
5286 V = DAG.getUNDEF(MVT::v16i8);
5289 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5290 MVT::v16i8, V, Op.getOperand(i),
5291 DAG.getIntPtrConstant(i, dl));
5298 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
5299 for (unsigned i = 0; i < 16; ++i) {
5300 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5301 if (ThisIsNonZero && First) {
5303 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5305 V = DAG.getUNDEF(MVT::v8i16);
5310 SDValue ThisElt, LastElt;
5311 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5312 if (LastIsNonZero) {
5313 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5314 MVT::i16, Op.getOperand(i-1));
5316 if (ThisIsNonZero) {
5317 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5318 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5319 ThisElt, DAG.getConstant(8, dl, MVT::i8));
5321 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5325 if (ThisElt.getNode())
5326 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5327 DAG.getIntPtrConstant(i/2, dl));
5331 return DAG.getBitcast(MVT::v16i8, V);
5334 /// Custom lower build_vector of v8i16.
5335 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5336 unsigned NumNonZero, unsigned NumZero,
5338 const X86Subtarget &Subtarget,
5339 const TargetLowering &TLI) {
5346 for (unsigned i = 0; i < 8; ++i) {
5347 bool isNonZero = (NonZeros & (1 << i)) != 0;
5351 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5353 V = DAG.getUNDEF(MVT::v8i16);
5356 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5357 MVT::v8i16, V, Op.getOperand(i),
5358 DAG.getIntPtrConstant(i, dl));
5365 /// Custom lower build_vector of v4i32 or v4f32.
5366 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5367 const X86Subtarget &Subtarget,
5368 const TargetLowering &TLI) {
5369 // Find all zeroable elements.
5370 std::bitset<4> Zeroable;
5371 for (int i=0; i < 4; ++i) {
5372 SDValue Elt = Op->getOperand(i);
5373 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
5375 assert(Zeroable.size() - Zeroable.count() > 1 &&
5376 "We expect at least two non-zero elements!");
5378 // We only know how to deal with build_vector nodes where elements are either
5379 // zeroable or extract_vector_elt with constant index.
5380 SDValue FirstNonZero;
5381 unsigned FirstNonZeroIdx;
5382 for (unsigned i=0; i < 4; ++i) {
5385 SDValue Elt = Op->getOperand(i);
5386 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5387 !isa<ConstantSDNode>(Elt.getOperand(1)))
5389 // Make sure that this node is extracting from a 128-bit vector.
5390 MVT VT = Elt.getOperand(0).getSimpleValueType();
5391 if (!VT.is128BitVector())
5393 if (!FirstNonZero.getNode()) {
5395 FirstNonZeroIdx = i;
5399 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5400 SDValue V1 = FirstNonZero.getOperand(0);
5401 MVT VT = V1.getSimpleValueType();
5403 // See if this build_vector can be lowered as a blend with zero.
5405 unsigned EltMaskIdx, EltIdx;
5407 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5408 if (Zeroable[EltIdx]) {
5409 // The zero vector will be on the right hand side.
5410 Mask[EltIdx] = EltIdx+4;
5414 Elt = Op->getOperand(EltIdx);
5415 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5416 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5417 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5419 Mask[EltIdx] = EltIdx;
5423 // Let the shuffle legalizer deal with blend operations.
5424 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5425 if (V1.getSimpleValueType() != VT)
5426 V1 = DAG.getBitcast(VT, V1);
5427 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
5430 // See if we can lower this build_vector to a INSERTPS.
5431 if (!Subtarget.hasSSE41())
5434 SDValue V2 = Elt.getOperand(0);
5435 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5438 bool CanFold = true;
5439 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5443 SDValue Current = Op->getOperand(i);
5444 SDValue SrcVector = Current->getOperand(0);
5447 CanFold = SrcVector == V1 &&
5448 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5454 assert(V1.getNode() && "Expected at least two non-zero elements!");
5455 if (V1.getSimpleValueType() != MVT::v4f32)
5456 V1 = DAG.getBitcast(MVT::v4f32, V1);
5457 if (V2.getSimpleValueType() != MVT::v4f32)
5458 V2 = DAG.getBitcast(MVT::v4f32, V2);
5460 // Ok, we can emit an INSERTPS instruction.
5461 unsigned ZMask = Zeroable.to_ulong();
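// INSERTPS immediate layout: bits [7:6] select the source element of V2,
// bits [5:4] select the destination lane in V1, and bits [3:0] zero lanes.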
5463 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5464 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5466 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
5467 DAG.getIntPtrConstant(InsertPSMask, DL));
5468 return DAG.getBitcast(VT, Result);
5471 /// Return a vector logical shift node.
5472 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
5473 SelectionDAG &DAG, const TargetLowering &TLI,
5475 assert(VT.is128BitVector() && "Unknown type for VShift");
5476 MVT ShVT = MVT::v16i8;
5477 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5478 SrcOp = DAG.getBitcast(ShVT, SrcOp);
5479 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
5480 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
5481 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
5482 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5485 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
5486 SelectionDAG &DAG) {
5488 // Check if the scalar load can be widened into a vector load. And if
5489 // the address is "base + cst" see if the cst can be "absorbed" into
5490 // the shuffle mask.
5491 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5492 SDValue Ptr = LD->getBasePtr();
5493 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5495 EVT PVT = LD->getValueType(0);
5496 if (PVT != MVT::i32 && PVT != MVT::f32)
5501 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5502 FI = FINode->getIndex();
5504 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5505 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5506 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5507 Offset = Ptr.getConstantOperandVal(1);
5508 Ptr = Ptr.getOperand(0);
5513 // FIXME: 256-bit vector instructions don't require a strict alignment,
5514 // improve this code to support it better.
5515 unsigned RequiredAlign = VT.getSizeInBits()/8;
5516 SDValue Chain = LD->getChain();
5517 // Make sure the stack object alignment is at least 16 or 32.
5518 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5519 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5520 if (MFI->isFixedObjectIndex(FI)) {
5521 // Can't change the alignment. FIXME: It's possible to compute
5522 // the exact stack offset and reference FI + adjust offset instead.
5523 // If someone *really* cares about this, that's the way to implement it.
5524 return SDValue();
5525 } else {
5526 MFI->setObjectAlignment(FI, RequiredAlign);
5527 }
5528 }
5530 // (Offset % 16 or 32) must be a multiple of 4. The address is then
5531 // Ptr + (Offset & ~15).
5534 if ((Offset % RequiredAlign) & 3)
5536 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
5539 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5540 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
5543 int EltNo = (Offset - StartOffset) >> 2;
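// PVT is i32 or f32 here, so each element is 4 bytes and the remaining
// byte offset (Offset - StartOffset) picks the element to splat.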
5544 unsigned NumElems = VT.getVectorNumElements();
5546 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
5547 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
5548 LD->getPointerInfo().getWithOffset(StartOffset));
5550 SmallVector<int, 8> Mask(NumElems, EltNo);
5552 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
5558 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
5559 /// elements can be replaced by a single large load which has the same value as
5560 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
5562 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
5563 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
5564 SDLoc &DL, SelectionDAG &DAG,
5565 bool isAfterLegalize) {
5566 unsigned NumElems = Elts.size();
5568 int LastLoadedElt = -1;
5569 SmallBitVector LoadMask(NumElems, false);
5570 SmallBitVector ZeroMask(NumElems, false);
5571 SmallBitVector UndefMask(NumElems, false);
5573 // For each element in the initializer, see if we've found a load, zero or
5574 // an undef.
5575 for (unsigned i = 0; i < NumElems; ++i) {
5576 SDValue Elt = peekThroughBitcasts(Elts[i]);
5581 UndefMask[i] = true;
5582 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
5584 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
5587 // Each loaded element must be the correct fractional portion of the
5588 // requested vector load.
5589 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
5594 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
5595 "Incomplete element masks");
5597 // Handle Special Cases - all undef or undef/zero.
5598 if (UndefMask.count() == NumElems)
5599 return DAG.getUNDEF(VT);
5601 // FIXME: Should we return this as a BUILD_VECTOR instead?
5602 if ((ZeroMask | UndefMask).count() == NumElems)
5603 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
5604 : DAG.getConstantFP(0.0, DL, VT);
5606 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5607 int FirstLoadedElt = LoadMask.find_first();
5608 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
5609 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
5610 EVT LDBaseVT = EltBase.getValueType();
5612 // Consecutive loads can contain UNDEFs but not ZERO elements.
5613 // Consecutive loads with UNDEF and ZERO elements require an
5614 // additional shuffle stage to clear the ZERO elements.
5615 bool IsConsecutiveLoad = true;
5616 bool IsConsecutiveLoadWithZeros = true;
5617 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
5619 SDValue Elt = peekThroughBitcasts(Elts[i]);
5620 LoadSDNode *LD = cast<LoadSDNode>(Elt);
5621 if (!DAG.areNonVolatileConsecutiveLoads(
5622 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
5623 i - FirstLoadedElt)) {
5624 IsConsecutiveLoad = false;
5625 IsConsecutiveLoadWithZeros = false;
5628 } else if (ZeroMask[i]) {
5629 IsConsecutiveLoad = false;
5633 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
5634 auto MMOFlags = LDBase->getMemOperand()->getFlags();
5635 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
5636 "Cannot merge volatile loads.");
5637 SDValue NewLd =
5638 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5639 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
5641 if (LDBase->hasAnyUseOfValue(1)) {
5642 SDValue NewChain =
5643 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
5644 SDValue(NewLd.getNode(), 1));
5645 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5646 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5647 SDValue(NewLd.getNode(), 1));
5653 // LOAD - all consecutive load/undefs (must start/end with a load).
5654 // If we have found an entire vector of loads and undefs, then return a large
5655 // load of the entire vector width starting at the base pointer.
5656 // If the vector contains zeros, then attempt to shuffle those elements.
5657 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
5658 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
5659 assert(LDBase && "Did not find base load for merging consecutive loads");
5660 EVT EltVT = LDBase->getValueType(0);
5661 // Ensure that the input vector size for the merged loads matches the
5662 // cumulative size of the input elements.
5663 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
5666 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
5669 if (IsConsecutiveLoad)
5670 return CreateLoad(VT, LDBase);
5672 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
5673 // vector and a zero vector to clear out the zero elements.
5674 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
5675 SmallVector<int, 4> ClearMask(NumElems, -1);
5676 for (unsigned i = 0; i < NumElems; ++i) {
5677 if (ZeroMask[i])
5678 ClearMask[i] = i + NumElems;
5679 else if (LoadMask[i])
5680 ClearMask[i] = i;
5681 }
5682 SDValue V = CreateLoad(VT, LDBase);
5683 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
5684 : DAG.getConstantFP(0.0, DL, VT);
5685 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
5686 }
5687 }
5689 int LoadSize =
5690 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
5692 // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
5693 if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
5694 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
5695 MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
5696 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
5697 if (TLI.isTypeLegal(VecVT)) {
5698 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
5699 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
5700 SDValue ResNode =
5701 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
5702 LDBase->getPointerInfo(),
5703 LDBase->getAlignment(),
5704 false/*isVolatile*/, true/*ReadMem*/,
5707 // Make sure the newly-created LOAD is in the same position as LDBase in
5708 // terms of dependency. We create a TokenFactor for LDBase and ResNode,
5709 // and update uses of LDBase's output chain to use the TokenFactor.
5710 if (LDBase->hasAnyUseOfValue(1)) {
5712 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
5713 SDValue(ResNode.getNode(), 1));
5714 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5715 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5716 SDValue(ResNode.getNode(), 1));
5719 return DAG.getBitcast(VT, ResNode);
5723 // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
5724 if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
5725 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
5726 MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
5727 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
5728 if (TLI.isTypeLegal(VecVT)) {
5729 SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase)
5730 : DAG.getBitcast(VecSVT, EltBase);
5731 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
5732 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
5733 return DAG.getBitcast(VT, V);
5740 /// Attempt to use the vbroadcast instruction to generate a splat value for the
5741 /// following cases:
5742 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
5743 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
5744 /// a scalar load, or a constant.
5745 /// The VBROADCAST node is returned when a pattern is found,
5746 /// or SDValue() otherwise.
5747 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget,
5748 SelectionDAG &DAG) {
5749 // VBROADCAST requires AVX.
5750 // TODO: Splats could be generated for non-AVX CPUs using SSE
5751 // instructions, but there's less potential gain for only 128-bit vectors.
5752 if (!Subtarget.hasAVX())
5755 MVT VT = Op.getSimpleValueType();
5758 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5759 "Unsupported vector type for broadcast.");
5764 switch (Op.getOpcode()) {
5765 default:
5766 // Unknown pattern found.
5767 return SDValue();
5769 case ISD::BUILD_VECTOR: {
5770 auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
5771 BitVector UndefElements;
5772 SDValue Splat = BVOp->getSplatValue(&UndefElements);
5774 // We need a splat of a single value to use broadcast, and it doesn't
5775 // make any sense if the value is only in one element of the vector.
5776 if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
5780 ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5781 Ld.getOpcode() == ISD::ConstantFP);
5783 // Make sure that all of the users of a non-constant load are from the
5784 // BUILD_VECTOR node.
5785 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
5790 case ISD::VECTOR_SHUFFLE: {
5791 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5793 // Shuffles must have a splat mask where the first element is
5794 // broadcasted.
5795 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
5796 return SDValue();
5798 SDValue Sc = Op.getOperand(0);
5799 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
5800 Sc.getOpcode() != ISD::BUILD_VECTOR) {
5802 if (!Subtarget.hasInt256())
5805 // Use the register form of the broadcast instruction available on AVX2.
5806 if (VT.getSizeInBits() >= 256)
5807 Sc = extract128BitVector(Sc, 0, DAG, dl);
5808 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
5811 Ld = Sc.getOperand(0);
5812 ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5813 Ld.getOpcode() == ISD::ConstantFP);
5815 // The scalar_to_vector node and the suspected
5816 // load node must have exactly one user.
5817 // Constants may have multiple users.
5819 // AVX-512 has a register version of the broadcast.
5820 bool hasRegVer = Subtarget.hasAVX512() && VT.is512BitVector() &&
5821 Ld.getValueType().getSizeInBits() >= 32;
5822 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
5829 unsigned ScalarSize = Ld.getValueType().getSizeInBits();
5830 bool IsGE256 = (VT.getSizeInBits() >= 256);
5832 // When optimizing for size, generate up to 5 extra bytes for a broadcast
5833 // instruction to save 8 or more bytes of constant pool data.
5834 // TODO: If multiple splats are generated to load the same constant,
5835 // it may be detrimental to overall size. There needs to be a way to detect
5836 // that condition to know if this is truly a size win.
5837 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
5839 // Handle broadcasting a single constant scalar from the constant pool
5841 // On Sandybridge (no AVX2), it is still better to load a constant vector
5842 // from the constant pool and not to broadcast it from a scalar.
5843 // But override that restriction when optimizing for size.
5844 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
5845 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
5846 EVT CVT = Ld.getValueType();
5847 assert(!CVT.isVector() && "Must not broadcast a vector type");
5849 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
5850 // For size optimization, also splat v2f64 and v2i64, and for size opt
5851 // with AVX2, also splat i8 and i16.
5852 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
5853 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
5854 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
5855 const Constant *C = nullptr;
5856 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
5857 C = CI->getConstantIntValue();
5858 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
5859 C = CF->getConstantFPValue();
5861 assert(C && "Invalid constant type");
5863 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5865 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
5866 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
5868 CVT, dl, DAG.getEntryNode(), CP,
5869 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
5872 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5876 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
5878 // Handle AVX2 in-register broadcasts.
5879 if (!IsLoad && Subtarget.hasInt256() &&
5880 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
5881 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5883 // The scalar source must be a normal load.
5884 if (!IsLoad)
5885 return SDValue();
5887 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
5888 (Subtarget.hasVLX() && ScalarSize == 64))
5889 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5891 // The integer check is needed for the 64-bit into 128-bit case, so that it
5892 // doesn't match double, since there is no 128-bit (xmm) form of vbroadcastsd.
5893 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
5894 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
5895 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5896 }
5898 // Unsupported broadcast.
5899 return SDValue();
5900 }
5902 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
5903 /// underlying vector and index.
5905 /// Modifies \p ExtractedFromVec to the real vector and returns the real
5907 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
5909 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
5910 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
5913 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
5915 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
5917 // (extract_vector_elt (vector_shuffle<2,u,u,u>
5918 // (extract_subvector (v8f32 %vreg0), Constant<4>),
5921 // In this case the vector is the extract_subvector expression and the index
5922 // is 2, as specified by the shuffle.
5923 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
5924 SDValue ShuffleVec = SVOp->getOperand(0);
5925 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
5926 assert(ShuffleVecVT.getVectorElementType() ==
5927 ExtractedFromVec.getSimpleValueType().getVectorElementType());
5929 int ShuffleIdx = SVOp->getMaskElt(Idx);
5930 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
5931 ExtractedFromVec = ShuffleVec;
5937 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
5938 MVT VT = Op.getSimpleValueType();
5940 // Skip if insert_vec_elt is not supported.
5941 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5942 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
5946 unsigned NumElems = Op.getNumOperands();
5950 SmallVector<unsigned, 4> InsertIndices;
5951 SmallVector<int, 8> Mask(NumElems, -1);
5953 for (unsigned i = 0; i != NumElems; ++i) {
5954 unsigned Opc = Op.getOperand(i).getOpcode();
5956 if (Opc == ISD::UNDEF)
5959 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
5960 // Quit if more than 1 elements need inserting.
5961 if (InsertIndices.size() > 1)
5964 InsertIndices.push_back(i);
5968 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
5969 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
5970 // Quit if non-constant index.
5971 if (!isa<ConstantSDNode>(ExtIdx))
5973 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
5975 // Quit if extracted from vector of different type.
5976 if (ExtractedFromVec.getValueType() != VT)
5979 if (!VecIn1.getNode())
5980 VecIn1 = ExtractedFromVec;
5981 else if (VecIn1 != ExtractedFromVec) {
5982 if (!VecIn2.getNode())
5983 VecIn2 = ExtractedFromVec;
5984 else if (VecIn2 != ExtractedFromVec)
5985 // Quit if more than 2 vectors to shuffle
5989 if (ExtractedFromVec == VecIn1)
5991 else if (ExtractedFromVec == VecIn2)
5992 Mask[i] = Idx + NumElems;
5995 if (!VecIn1.getNode())
5998 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
5999 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6000 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6001 unsigned Idx = InsertIndices[i];
6002 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6003 DAG.getIntPtrConstant(Idx, DL));
6009 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6010 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6011 Op.getScalarValueSizeInBits() == 1 &&
6012 "Can not convert non-constant vector");
6013 uint64_t Immediate = 0;
6014 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6015 SDValue In = Op.getOperand(idx);
6017 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6021 MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8));
6022 return DAG.getConstant(Immediate, dl, VT);
6024 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6026 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6028 MVT VT = Op.getSimpleValueType();
6029 assert((VT.getVectorElementType() == MVT::i1) &&
6030 "Unexpected type in LowerBUILD_VECTORvXi1!");
6033 if (ISD::isBuildVectorAllZeros(Op.getNode()))
6034 return DAG.getTargetConstant(0, dl, VT);
6036 if (ISD::isBuildVectorAllOnes(Op.getNode()))
6037 return DAG.getTargetConstant(1, dl, VT);
6039 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6040 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6041 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6042 return DAG.getBitcast(VT, Imm);
6043 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6044 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6045 DAG.getIntPtrConstant(0, dl));
6048 // Vector has one or more non-const elements
6049 uint64_t Immediate = 0;
6050 SmallVector<unsigned, 16> NonConstIdx;
6051 bool IsSplat = true;
6052 bool HasConstElts = false;
6054 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6055 SDValue In = Op.getOperand(idx);
6058 if (!isa<ConstantSDNode>(In))
6059 NonConstIdx.push_back(idx);
6061 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6062 HasConstElts = true;
6066 else if (In != Op.getOperand(SplatIdx))
6070 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
6072 return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
6073 DAG.getConstant(1, dl, VT),
6074 DAG.getConstant(0, dl, VT));
6076 // insert elements one by one
6080 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6081 Imm = DAG.getConstant(Immediate, dl, ImmVT);
6083 else if (HasConstElts)
6084 Imm = DAG.getConstant(0, dl, VT);
6086 Imm = DAG.getUNDEF(VT);
6087 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6088 DstVec = DAG.getBitcast(VT, Imm);
6090 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6091 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6092 DAG.getIntPtrConstant(0, dl));
6095 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6096 unsigned InsertIdx = NonConstIdx[i];
6097 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6098 Op.getOperand(InsertIdx),
6099 DAG.getIntPtrConstant(InsertIdx, dl));
6104 /// \brief Return true if \p N implements a horizontal binop and return the
6105 /// operands for the horizontal binop into V0 and V1.
6107 /// This is a helper function of LowerToHorizontalOp().
6108 /// This function checks that the build_vector \p N in input implements a
6109 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6110 /// operation to match.
6111 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6112 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6113 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6116 /// This function only analyzes elements of \p N whose indices are
6117 /// in range [BaseIdx, LastIdx).
6118 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6120 unsigned BaseIdx, unsigned LastIdx,
6121 SDValue &V0, SDValue &V1) {
6122 EVT VT = N->getValueType(0);
6124 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6125 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6126 "Invalid Vector in input!");
6128 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6129 bool CanFold = true;
6130 unsigned ExpectedVExtractIdx = BaseIdx;
6131 unsigned NumElts = LastIdx - BaseIdx;
6132 V0 = DAG.getUNDEF(VT);
6133 V1 = DAG.getUNDEF(VT);
6135 // Check if N implements a horizontal binop.
6136 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6137 SDValue Op = N->getOperand(i + BaseIdx);
6140 if (Op->isUndef()) {
6141 // Update the expected vector extract index.
6142 if (i * 2 == NumElts)
6143 ExpectedVExtractIdx = BaseIdx;
6144 ExpectedVExtractIdx += 2;
6148 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6153 SDValue Op0 = Op.getOperand(0);
6154 SDValue Op1 = Op.getOperand(1);
6156 // Try to match the following pattern:
6157 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6158 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6159 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6160 Op0.getOperand(0) == Op1.getOperand(0) &&
6161 isa<ConstantSDNode>(Op0.getOperand(1)) &&
6162 isa<ConstantSDNode>(Op1.getOperand(1)));
6166 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6167 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6169 if (i * 2 < NumElts) {
6171 V0 = Op0.getOperand(0);
6172 if (V0.getValueType() != VT)
6177 V1 = Op0.getOperand(0);
6178 if (V1.getValueType() != VT)
6181 if (i * 2 == NumElts)
6182 ExpectedVExtractIdx = BaseIdx;
6185 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6186 if (I0 == ExpectedVExtractIdx)
6187 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6188 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6189 // Try to match the following dag sequence:
6190 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6191 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6195 ExpectedVExtractIdx += 2;
6201 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6202 /// a concat_vector.
6204 /// This is a helper function of LowerToHorizontalOp().
6205 /// This function expects two 256-bit vectors called V0 and V1.
6206 /// At first, each vector is split into two separate 128-bit vectors.
6207 /// Then, the resulting 128-bit vectors are used to implement two
6208 /// horizontal binary operations.
6210 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6212 /// \p Mode specifies how the 128-bit halves of V0 and V1 are passed to
6213 /// the two new horizontal binops.
6214 /// When Mode is set, the first horizontal binop dag node would take as input
6215 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6216 /// horizontal binop dag node would take as input the lower 128-bit of V1
6217 /// and the upper 128-bit of V1.
6219 /// HADD V0_LO, V0_HI
6220 /// HADD V1_LO, V1_HI
6222 /// Otherwise, the first horizontal binop dag node takes as input the lower
6223 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6224 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
6226 /// HADD V0_LO, V1_LO
6227 /// HADD V0_HI, V1_HI
6229 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6230 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6231 /// the upper 128-bits of the result.
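///
/// As a sketch, for v8f32 inputs and \p X86Opcode == X86ISD::FHADD this
/// produces either
///   concat(FHADD(V0_LO, V0_HI), FHADD(V1_LO, V1_HI))   when Mode is set, or
///   concat(FHADD(V0_LO, V1_LO), FHADD(V0_HI, V1_HI))   otherwise,
/// with isUndefLO/isUndefHI forcing the corresponding half to UNDEF.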
6232 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6233 const SDLoc &DL, SelectionDAG &DAG,
6234 unsigned X86Opcode, bool Mode,
6235 bool isUndefLO, bool isUndefHI) {
6236 MVT VT = V0.getSimpleValueType();
6237 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
6238 "Invalid nodes in input!");
6240 unsigned NumElts = VT.getVectorNumElements();
6241 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
6242 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
6243 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
6244 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
6245 MVT NewVT = V0_LO.getSimpleValueType();
6247 SDValue LO = DAG.getUNDEF(NewVT);
6248 SDValue HI = DAG.getUNDEF(NewVT);
6251 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6252 if (!isUndefLO && !V0->isUndef())
6253 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6254 if (!isUndefHI && !V1->isUndef())
6255 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6257 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6258 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
6259 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6261 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
6262 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6265 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6268 /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB node.
6270 static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
6271 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
6272 MVT VT = BV->getSimpleValueType(0);
6273 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
6274 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
6278 unsigned NumElts = VT.getVectorNumElements();
6279 SDValue InVec0 = DAG.getUNDEF(VT);
6280 SDValue InVec1 = DAG.getUNDEF(VT);
6282 assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6283 VT == MVT::v2f64) && "build_vector with an invalid type found!");
6285 // Odd-numbered elements in the input build vector are obtained from
6286 // adding two integer/float elements.
6287 // Even-numbered elements in the input build vector are obtained from
6288 // subtracting two integer/float elements.
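// For example, a v4f32 build_vector of the form
//   (fsub (extractelt A, 0), (extractelt B, 0)),
//   (fadd (extractelt A, 1), (extractelt B, 1)),
//   (fsub (extractelt A, 2), (extractelt B, 2)),
//   (fadd (extractelt A, 3), (extractelt B, 3))
// is folded to (X86ISD::ADDSUB A, B), i.e. a single ADDSUBPS.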
6289 unsigned ExpectedOpcode = ISD::FSUB;
6290 unsigned NextExpectedOpcode = ISD::FADD;
6291 bool AddFound = false;
6292 bool SubFound = false;
6294 for (unsigned i = 0, e = NumElts; i != e; ++i) {
6295 SDValue Op = BV->getOperand(i);
6297 // Skip 'undef' values.
6298 unsigned Opcode = Op.getOpcode();
6299 if (Opcode == ISD::UNDEF) {
6300 std::swap(ExpectedOpcode, NextExpectedOpcode);
6304 // Early exit if we found an unexpected opcode.
6305 if (Opcode != ExpectedOpcode)
6308 SDValue Op0 = Op.getOperand(0);
6309 SDValue Op1 = Op.getOperand(1);
6311 // Try to match the following pattern:
6312 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6313 // Early exit if we cannot match that sequence.
6314 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6315 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6316 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6317 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6318 Op0.getOperand(1) != Op1.getOperand(1))
6321 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6325 // We found a valid add/sub node. Update the information accordingly.
6331 // Update InVec0 and InVec1.
6332 if (InVec0.isUndef()) {
6333 InVec0 = Op0.getOperand(0);
6334 if (InVec0.getSimpleValueType() != VT)
6337 if (InVec1.isUndef()) {
6338 InVec1 = Op1.getOperand(0);
6339 if (InVec1.getSimpleValueType() != VT)
6343 // Make sure that the operands of each add/sub node always
6344 // come from the same pair of vectors.
6345 if (InVec0 != Op0.getOperand(0)) {
6346 if (ExpectedOpcode == ISD::FSUB)
6349 // FADD is commutable. Try to commute the operands
6350 // and then test again.
6351 std::swap(Op0, Op1);
6352 if (InVec0 != Op0.getOperand(0))
6356 if (InVec1 != Op1.getOperand(0))
6359 // Update the pair of expected opcodes.
6360 std::swap(ExpectedOpcode, NextExpectedOpcode);
6363 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6364 if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
6365 return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6370 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
6371 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
6372 const X86Subtarget &Subtarget,
6373 SelectionDAG &DAG) {
6374 MVT VT = BV->getSimpleValueType(0);
6375 unsigned NumElts = VT.getVectorNumElements();
6376 unsigned NumUndefsLO = 0;
6377 unsigned NumUndefsHI = 0;
6378 unsigned Half = NumElts/2;
6380 // Count the number of UNDEF operands in the build_vector in input.
6381 for (unsigned i = 0, e = Half; i != e; ++i)
6382 if (BV->getOperand(i)->isUndef())
6385 for (unsigned i = Half, e = NumElts; i != e; ++i)
6386 if (BV->getOperand(i)->isUndef())
6389 // Early exit if this is either a build_vector of all UNDEFs or all the
6390 // operands but one are UNDEF.
6391 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6395 SDValue InVec0, InVec1;
6396 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
6397 // Try to match an SSE3 float HADD/HSUB.
6398 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6399 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6401 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6402 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6403 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
6404 // Try to match an SSSE3 integer HADD/HSUB.
6405 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6406 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6408 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6409 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6412 if (!Subtarget.hasAVX())
6415 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6416 // Try to match an AVX horizontal add/sub of packed single/double
6417 // precision floating point values from 256-bit vectors.
6418 SDValue InVec2, InVec3;
6419 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6420 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6421 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6422 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6423 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6425 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6426 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6427 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6428 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6429 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6430 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6431 // Try to match an AVX2 horizontal add/sub of signed integers.
6432 SDValue InVec2, InVec3;
6434 bool CanFold = true;
6436 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6437 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6438 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6439 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6440 X86Opcode = X86ISD::HADD;
6441 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6442 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6443 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6444 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6445 X86Opcode = X86ISD::HSUB;
6450 // Fold this build_vector into a single horizontal add/sub.
6451 // Do this only if the target has AVX2.
6452 if (Subtarget.hasAVX2())
6453 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6455 // Do not try to expand this build_vector into a pair of horizontal
6456 // add/sub if we can emit a pair of scalar add/sub.
6457 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6460 // Convert this build_vector into a pair of horizontal binop followed by
6462 bool isUndefLO = NumUndefsLO == Half;
6463 bool isUndefHI = NumUndefsHI == Half;
6464 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6465 isUndefLO, isUndefHI);
6469 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6470 VT == MVT::v16i16) && Subtarget.hasAVX()) {
6472 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6473 X86Opcode = X86ISD::HADD;
6474 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6475 X86Opcode = X86ISD::HSUB;
6476 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6477 X86Opcode = X86ISD::FHADD;
6478 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6479 X86Opcode = X86ISD::FHSUB;
6483 // Don't try to expand this build_vector into a pair of horizontal add/sub
6484 // if we can simply emit a pair of scalar add/sub.
6485 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6488 // Convert this build_vector into two horizontal add/sub followed by
6490 bool isUndefLO = NumUndefsLO == Half;
6491 bool isUndefHI = NumUndefsHI == Half;
6492 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6493 isUndefLO, isUndefHI);
6499 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
6500 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
6501 /// just apply the bit operation to the vectors.
6502 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
6503 /// from this, but enough scalar bit operations are created from the later
6504 /// legalization + scalarization stages to need basic support.
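///
/// For example, (build_vector (and a, 1), (and b, 2), (and c, 4), (and d, 8))
/// becomes (and (build_vector a, b, c, d), (build_vector 1, 2, 4, 8)).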
6505 static SDValue lowerBuildVectorToBitOp(SDValue Op, SelectionDAG &DAG) {
6507 MVT VT = Op.getSimpleValueType();
6508 unsigned NumElems = VT.getVectorNumElements();
6509 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6511 // Check that all elements have the same opcode.
6512 // TODO: Should we allow UNDEFS and if so how many?
6513 unsigned Opcode = Op.getOperand(0).getOpcode();
6514 for (unsigned i = 1; i < NumElems; ++i)
6515 if (Opcode != Op.getOperand(i).getOpcode())
6518 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
6525 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
6530 SmallVector<SDValue, 4> LHSElts, RHSElts;
6531 for (SDValue Elt : Op->ops()) {
6532 SDValue LHS = Elt.getOperand(0);
6533 SDValue RHS = Elt.getOperand(1);
6535 // We expect the canonicalized RHS operand to be the constant.
6536 if (!isa<ConstantSDNode>(RHS))
6538 LHSElts.push_back(LHS);
6539 RHSElts.push_back(RHS);
6542 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
6543 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
6544 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
6547 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
6548 /// functionality to do this, so it's all zeros, all ones, or some derivation
6549 /// that is cheap to calculate.
6550 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
6551 const X86Subtarget &Subtarget) {
6553 MVT VT = Op.getSimpleValueType();
6555 // Vectors containing all zeros can be matched by pxor and xorps.
6556 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6557 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6558 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6559 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6562 return getZeroVector(VT, Subtarget, DAG, DL);
6565 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6566 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6567 // vpcmpeqd on 256-bit vectors.
6568 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6569 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
6570 (VT == MVT::v8i32 && Subtarget.hasInt256()))
6573 return getOnesVector(VT, Subtarget, DAG, DL);
6580 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6583 MVT VT = Op.getSimpleValueType();
6584 MVT ExtVT = VT.getVectorElementType();
6585 unsigned NumElems = Op.getNumOperands();
6587 // Generate vectors for predicate vectors.
6588 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
6589 return LowerBUILD_VECTORvXi1(Op, DAG);
6591 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
6592 return VectorConstant;
6594 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
6595 if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
6597 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
6598 return HorizontalOp;
6599 if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
6601 if (SDValue BitOp = lowerBuildVectorToBitOp(Op, DAG))
6604 unsigned EVTBits = ExtVT.getSizeInBits();
6606 unsigned NumZero = 0;
6607 unsigned NumNonZero = 0;
6608 uint64_t NonZeros = 0;
6609 bool IsAllConstants = true;
6610 SmallSet<SDValue, 8> Values;
6611 for (unsigned i = 0; i < NumElems; ++i) {
6612 SDValue Elt = Op.getOperand(i);
6616 if (Elt.getOpcode() != ISD::Constant &&
6617 Elt.getOpcode() != ISD::ConstantFP)
6618 IsAllConstants = false;
6619 if (X86::isZeroNode(Elt))
6622 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
6623 NonZeros |= ((uint64_t)1 << i);
6628 // All undef vector. Return an UNDEF. All zero vectors were handled above.
6629 if (NumNonZero == 0)
6630 return DAG.getUNDEF(VT);
6632 // Special case for single non-zero, non-undef, element.
6633 if (NumNonZero == 1) {
6634 unsigned Idx = countTrailingZeros(NonZeros);
6635 SDValue Item = Op.getOperand(Idx);
6637 // If this is an insertion of an i64 value on x86-32, and if the top bits of
6638 // the value are obviously zero, truncate the value to i32 and do the
6639 // insertion that way. Only do this if the value is non-constant or if the
6640 // value is a constant being inserted into element 0. It is cheaper to do
6641 // a constant pool load than it is to do a movd + shuffle.
6642 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
6643 (!IsAllConstants || Idx == 0)) {
6644 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6646 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6647 MVT VecVT = MVT::v4i32;
6649 // Truncate the value (which may itself be a constant) to i32, and
6650 // convert it to a vector with movd (S2V+shuffle to zero extend).
6651 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6652 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6653 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
6654 Item, Idx * 2, true, Subtarget, DAG));
6658 // If we have a constant or non-constant insertion into the low element of
6659 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6660 // the rest of the elements. This will be matched as movd/movq/movss/movsd
6661 // depending on what the source datatype is.
6664 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6666 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6667 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
6668 if (VT.is512BitVector()) {
6669 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6670 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6671 Item, DAG.getIntPtrConstant(0, dl));
6673 assert((VT.is128BitVector() || VT.is256BitVector()) &&
6674 "Expected an SSE value type!");
6675 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6676 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
6677 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6680 // We can't directly insert an i8 or i16 into a vector, so zero extend it to i32 first.
6682 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
6683 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
6684 if (VT.getSizeInBits() >= 256) {
6685 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
6686 if (Subtarget.hasAVX()) {
6687 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
6688 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6690 // Without AVX, we need to extend to a 128-bit vector and then
6691 // insert into the 256-bit vector.
6692 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
6693 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
6694 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
6697 assert(VT.is128BitVector() && "Expected an SSE value type!");
6698 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
6699 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6701 return DAG.getBitcast(VT, Item);
6705 // Is it a vector logical left shift?
6706 if (NumElems == 2 && Idx == 1 &&
6707 X86::isZeroNode(Op.getOperand(0)) &&
6708 !X86::isZeroNode(Op.getOperand(1))) {
6709 unsigned NumBits = VT.getSizeInBits();
6710 return getVShift(true, VT,
6711 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6712 VT, Op.getOperand(1)),
6713 NumBits/2, DAG, *this, dl);
6716 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
6719 // Otherwise, if this is a vector with i32 or f32 elements, and the element
6720 // is a non-constant being inserted into an element other than the low one,
6721 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
6722 // movd/movss) to move this into the low element, then shuffle it into place.
6724 if (EVTBits == 32) {
6725 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6726 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
6730 // Splat is obviously ok. Let legalizer expand it to a shuffle.
6731 if (Values.size() == 1) {
6732 if (EVTBits == 32) {
6733 // Instead of a shuffle like this:
6734 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
6735 // Check if it's possible to issue this instead.
6736 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
6737 unsigned Idx = countTrailingZeros(NonZeros);
6738 SDValue Item = Op.getOperand(Idx);
6739 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
6740 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
6745 // A vector full of immediates; various special cases are already
6746 // handled, so this is best done with a single constant-pool load.
6750 // See if we can use a vector load to get all of the elements.
6751 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
6752 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
6753 if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
6757 // For AVX-length vectors, build the individual 128-bit pieces and use
6758 // shuffles to put them in place.
6759 if (VT.is256BitVector() || VT.is512BitVector()) {
6760 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
6762 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
6764 // Build both the lower and upper subvector.
6766 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
6767 SDValue Upper = DAG.getBuildVector(
6768 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
6770 // Recreate the wider vector with the lower and upper part.
6771 if (VT.is256BitVector())
6772 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
6773 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
6776 // Let legalizer expand 2-wide build_vectors.
6777 if (EVTBits == 64) {
6778 if (NumNonZero == 1) {
6779 // One half is zero or undef.
6780 unsigned Idx = countTrailingZeros(NonZeros);
6781 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
6782 Op.getOperand(Idx));
6783 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
6788 // If element VT is < 32 bits, convert it to inserts into a zero vector.
6789 if (EVTBits == 8 && NumElems == 16)
6790 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
6791 DAG, Subtarget, *this))
6794 if (EVTBits == 16 && NumElems == 8)
6795 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
6796 DAG, Subtarget, *this))
6799 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
6800 if (EVTBits == 32 && NumElems == 4)
6801 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
6804 // If element VT is == 32 bits, turn it into a number of shuffles.
6805 if (NumElems == 4 && NumZero > 0) {
6806 SmallVector<SDValue, 8> Ops(NumElems);
6807 for (unsigned i = 0; i < 4; ++i) {
6808 bool isZero = !(NonZeros & (1ULL << i));
6810 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
6812 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6815 for (unsigned i = 0; i < 2; ++i) {
6816 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
6819 Ops[i] = Ops[i*2]; // Must be a zero vector.
6822 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
6825 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
6828 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
6833 bool Reverse1 = (NonZeros & 0x3) == 2;
6834 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
6838 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
6839 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
6841 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
6844 if (Values.size() > 1 && VT.is128BitVector()) {
6845 // Check for a build vector from mostly shuffle plus few inserting.
6846 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
6849 // For SSE 4.1, use insertps to put the high elements into the low element.
6850 if (Subtarget.hasSSE41()) {
6852 if (!Op.getOperand(0).isUndef())
6853 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
6855 Result = DAG.getUNDEF(VT);
6857 for (unsigned i = 1; i < NumElems; ++i) {
6858 if (Op.getOperand(i).isUndef()) continue;
6859 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
6860 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6865 // Otherwise, expand into a number of unpckl*, start by extending each of
6866 // our (non-undef) elements to the full vector width with the element in the
6867 // bottom slot of the vector (which generates no code for SSE).
6868 SmallVector<SDValue, 8> Ops(NumElems);
6869 for (unsigned i = 0; i < NumElems; ++i) {
6870 if (!Op.getOperand(i).isUndef())
6871 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6873 Ops[i] = DAG.getUNDEF(VT);
6876 // Next, we iteratively mix elements, e.g. for v4f32:
6877 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
6878 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
6879 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
6880 unsigned EltStride = NumElems >> 1;
6881 while (EltStride != 0) {
6882 for (unsigned i = 0; i < EltStride; ++i) {
6883 // If Ops[i+EltStride] is undef and this is the first round of mixing,
6884 // then it is safe to just drop this shuffle: V[i] is already in the
6885 // right place, the one element (since it's the first round) being
6886 // inserted as undef can be dropped. This isn't safe for successive
6887 // rounds because they will permute elements within both vectors.
6888 if (Ops[i+EltStride].isUndef() &&
6889 EltStride == NumElems/2)
6892 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
6901 // 256-bit AVX can use the vinsertf128 instruction
6902 // to create 256-bit vectors from two other 128-bit ones.
6903 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6905 MVT ResVT = Op.getSimpleValueType();
6907 assert((ResVT.is256BitVector() ||
6908 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
6910 SDValue V1 = Op.getOperand(0);
6911 SDValue V2 = Op.getOperand(1);
6912 unsigned NumElems = ResVT.getVectorNumElements();
6913 if (ResVT.is256BitVector())
6914 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6916 if (Op.getNumOperands() == 4) {
6917 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
6918 ResVT.getVectorNumElements()/2);
6919 SDValue V3 = Op.getOperand(2);
6920 SDValue V4 = Op.getOperand(3);
6921 return concat256BitVectors(
6922 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
6923 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
6926 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6929 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
6930 const X86Subtarget &Subtarget,
6931 SelectionDAG & DAG) {
6933 MVT ResVT = Op.getSimpleValueType();
6934 unsigned NumOfOperands = Op.getNumOperands();
6936 assert(isPowerOf2_32(NumOfOperands) &&
6937 "Unexpected number of operands in CONCAT_VECTORS");
6939 SDValue Undef = DAG.getUNDEF(ResVT);
6940 if (NumOfOperands > 2) {
6941 // Specialize the cases when all, or all but one, of the operands are undef.
6942 unsigned NumOfDefinedOps = 0;
6944 for (unsigned i = 0; i < NumOfOperands; i++)
6945 if (!Op.getOperand(i).isUndef()) {
6949 if (NumOfDefinedOps == 0)
6951 if (NumOfDefinedOps == 1) {
6952 unsigned SubVecNumElts =
6953 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
6954 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
6955 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
6956 Op.getOperand(OpIdx), IdxVal);
6959 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
6960 ResVT.getVectorNumElements()/2);
6961 SmallVector<SDValue, 2> Ops;
6962 for (unsigned i = 0; i < NumOfOperands/2; i++)
6963 Ops.push_back(Op.getOperand(i));
6964 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
6966 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
6967 Ops.push_back(Op.getOperand(i));
6968 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
6969 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
6973 SDValue V1 = Op.getOperand(0);
6974 SDValue V2 = Op.getOperand(1);
6975 unsigned NumElems = ResVT.getVectorNumElements();
6976 assert(V1.getValueType() == V2.getValueType() &&
6977 V1.getValueType().getVectorNumElements() == NumElems/2 &&
6978 "Unexpected operands in CONCAT_VECTORS");
6980 if (ResVT.getSizeInBits() >= 16)
6981 return Op; // The operation is legal with KUNPCK
6983 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
6984 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
6985 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
6986 if (IsZeroV1 && IsZeroV2)
6989 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6991 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
6993 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
6995 SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
6997 V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
7000 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
7002 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7003 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
7006 static SDValue LowerCONCAT_VECTORS(SDValue Op,
7007 const X86Subtarget &Subtarget,
7008 SelectionDAG &DAG) {
7009 MVT VT = Op.getSimpleValueType();
7010 if (VT.getVectorElementType() == MVT::i1)
7011 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
7013 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7014 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7015 Op.getNumOperands() == 4)));
7017 // AVX can use the vinsertf128 instruction to create 256-bit vectors
7018 // from two other 128-bit ones.
7020 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7021 return LowerAVXCONCAT_VECTORS(Op, DAG);
7024 //===----------------------------------------------------------------------===//
7025 // Vector shuffle lowering
7027 // This is an experimental code path for lowering vector shuffles on x86. It is
7028 // designed to handle arbitrary vector shuffles and blends, gracefully
7029 // degrading performance as necessary. It works hard to recognize idiomatic
7030 // shuffles and lower them to optimal instruction patterns without leaving
7031 // a framework that allows reasonably efficient handling of all vector shuffle operations.
7033 //===----------------------------------------------------------------------===//
7035 /// \brief Tiny helper function to identify a no-op mask.
7037 /// This is a somewhat boring predicate function. It checks whether the mask
7038 /// array input, which is assumed to be a single-input shuffle mask of the kind
7039 /// used by the X86 shuffle instructions (not a fully general
7040 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7041 /// in-place shuffle are 'no-op's.
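///
/// For example, <-1, 1, 2, 3> is a no-op mask, while <1, 0, 2, 3> is not.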
7042 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7043 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7044 assert(Mask[i] >= -1 && "Out of bound mask element!");
7045 if (Mask[i] >= 0 && Mask[i] != i)
7051 /// \brief Test whether there are elements crossing 128-bit lanes in this shuffle mask.
7054 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7055 /// and we routinely test for these.
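///
/// For example, for v8f32 the per-lane unpack mask <0, 8, 1, 9, 4, 12, 5, 13>
/// does not cross lanes, whereas <0, 1, 2, 3, 8, 9, 10, 11> does.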
7056 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7057 int LaneSize = 128 / VT.getScalarSizeInBits();
7058 int Size = Mask.size();
7059 for (int i = 0; i < Size; ++i)
7060 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7065 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
7067 /// This checks a shuffle mask to see if it is performing the same
7068 /// lane-relative shuffle in each sub-lane. This trivially implies
7069 /// that it is also not lane-crossing. It may however involve a blend from the
7070 /// same lane of a second vector.
7072 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7073 /// non-trivial to compute in the face of undef lanes. The representation is
7074 /// suitable for use with existing 128-bit shuffles as entries from the second
7075 /// vector have been remapped to [LaneSize, 2*LaneSize).
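///
/// For example, the v8i32 mask <0, 8, 1, 9, 4, 12, 5, 13> is 128-bit lane
/// repeated and yields the repeated mask <0, 4, 1, 5>, with second-vector
/// entries remapped into [4, 8).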
7076 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
7078 SmallVectorImpl<int> &RepeatedMask) {
7079 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
7080 RepeatedMask.assign(LaneSize, -1);
7081 int Size = Mask.size();
7082 for (int i = 0; i < Size; ++i) {
7085 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7086 // This entry crosses lanes, so there is no way to model this shuffle.
7089 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7090 // Adjust second vector indices to start at LaneSize instead of Size.
7091 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
7092 : Mask[i] % LaneSize + LaneSize;
7093 if (RepeatedMask[i % LaneSize] < 0)
7094 // This is the first non-undef entry in this slot of a 128-bit lane.
7095 RepeatedMask[i % LaneSize] = LocalM;
7096 else if (RepeatedMask[i % LaneSize] != LocalM)
7097 // Found a mismatch with the repeated mask.
7103 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
7105 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7106 SmallVectorImpl<int> &RepeatedMask) {
7107 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
7110 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
7112 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7113 SmallVectorImpl<int> &RepeatedMask) {
7114 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
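/// Widen a shuffle mask by replacing each element with \p Scale consecutive
/// scaled entries; for example, scaling <0, 5> by 2 yields <0, 1, 10, 11>,
/// while sentinel (negative) entries are simply repeated.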
7117 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
7118 SmallVectorImpl<int> &ScaledMask) {
7119 assert(0 < Scale && "Unexpected scaling factor");
7120 int NumElts = Mask.size();
7121 ScaledMask.assign(NumElts * Scale, -1);
7123 for (int i = 0; i != NumElts; ++i) {
7126 // Repeat sentinel values in every mask element.
7128 for (int s = 0; s != Scale; ++s)
7129 ScaledMask[(Scale * i) + s] = M;
7133 // Scale mask element and increment across each mask element.
7134 for (int s = 0; s != Scale; ++s)
7135 ScaledMask[(Scale * i) + s] = (Scale * M) + s;
7139 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of arguments.
7142 /// This is a fast way to test a shuffle mask against a fixed pattern:
7144 /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
7146 /// It returns true if the mask is exactly as wide as the argument list, and
7147 /// each element of the mask is either -1 (signifying undef) or the value given
7148 /// in the argument.
7149 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
7150 ArrayRef<int> ExpectedMask) {
7151 if (Mask.size() != ExpectedMask.size())
7154 int Size = Mask.size();
7156 // If the values are build vectors, we can look through them to find
7157 // equivalent inputs that make the shuffles equivalent.
7158 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
7159 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
7161 for (int i = 0; i < Size; ++i) {
7162 assert(Mask[i] >= -1 && "Out of bound mask element!");
7163 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
7164 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
7165 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
7166 if (!MaskBV || !ExpectedBV ||
7167 MaskBV->getOperand(Mask[i] % Size) !=
7168 ExpectedBV->getOperand(ExpectedMask[i] % Size))
7176 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
7178 /// The masks must be exactly the same width.
7180 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
7181 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
7183 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
7184 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
7185 ArrayRef<int> ExpectedMask) {
7186 int Size = Mask.size();
7187 if (Size != (int)ExpectedMask.size())
7190 for (int i = 0; i < Size; ++i)
7191 if (Mask[i] == SM_SentinelUndef)
7193 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
7195 else if (Mask[i] != ExpectedMask[i])
7201 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7203 /// This helper function produces an 8-bit shuffle immediate corresponding to
7204 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7205 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7208 /// NB: We rely heavily on "undef" masks preserving the input lane.
7209 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
7210 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7211 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7212 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7213 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7214 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7217 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
7218 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
7219 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
7220 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
7224 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
7225 SelectionDAG &DAG) {
7226 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
7229 /// \brief Compute whether each element of a shuffle is zeroable.
7231 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7232 /// Either it is an undef element in the shuffle mask, the element of the input
7233 /// referenced is undef, or the element of the input referenced is known to be
7234 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7235 /// as many lanes with this technique as possible to simplify the remaining shuffle.
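///
/// For example, with V2 == (build_vector 0, undef, X, 0), the v4i32 mask
/// <0, 5, 2, 7> has elements 1 and 3 zeroable (an undef and a known-zero
/// input element respectively).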
7237 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7238 SDValue V1, SDValue V2) {
7239 SmallBitVector Zeroable(Mask.size(), false);
7240 V1 = peekThroughBitcasts(V1);
7241 V2 = peekThroughBitcasts(V2);
7243 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7244 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7246 int VectorSizeInBits = V1.getValueType().getSizeInBits();
7247 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
7248 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7250 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7252 // Handle the easy cases.
7253 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7258 // Determine shuffle input and normalize the mask.
7259 SDValue V = M < Size ? V1 : V2;
7262 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7263 if (V.getOpcode() != ISD::BUILD_VECTOR)
7266 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
7267 // portion of the (larger) source element must be UNDEF/ZERO.
7268 if ((Size % V.getNumOperands()) == 0) {
7269 int Scale = Size / V->getNumOperands();
7270 SDValue Op = V.getOperand(M / Scale);
7271 if (Op.isUndef() || X86::isZeroNode(Op))
7273 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7274 APInt Val = Cst->getAPIntValue();
7275 Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7276 Val = Val.getLoBits(ScalarSizeInBits);
7277 Zeroable[i] = (Val == 0);
7278 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7279 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7280 Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7281 Val = Val.getLoBits(ScalarSizeInBits);
7282 Zeroable[i] = (Val == 0);
7287 // If the BUILD_VECTOR has more elements than the mask, then all of the
7288 // (smaller) source elements must be UNDEF or ZERO.
7289 if ((V.getNumOperands() % Size) == 0) {
7290 int Scale = V->getNumOperands() / Size;
7291 bool AllZeroable = true;
7292 for (int j = 0; j < Scale; ++j) {
7293 SDValue Op = V.getOperand((M * Scale) + j);
7294 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
7296 Zeroable[i] = AllZeroable;
7304 /// Try to lower a shuffle with a single PSHUFB of V1.
7305 /// This is only possible if V2 is unused (at all, or only for zero elements).
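/// For example, the v8i16 single-input mask <1, 0, 3, 2, 5, 4, 7, 6> becomes a
/// PSHUFB whose control bytes are <2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13>,
/// and any zeroable element is encoded as the 0x80 "zero" control byte.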
7306 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
7307 ArrayRef<int> Mask, SDValue V1,
7309 const X86Subtarget &Subtarget,
7310 SelectionDAG &DAG) {
7311 int Size = Mask.size();
7312 int LaneSize = 128 / VT.getScalarSizeInBits();
7313 const int NumBytes = VT.getSizeInBits() / 8;
7314 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
7316 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
7317 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
7318 (Subtarget.hasBWI() && VT.is512BitVector()));
7320 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7322 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
7323 // Sign bit set in i8 mask means zero element.
7324 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
7326 for (int i = 0; i < NumBytes; ++i) {
7327 int M = Mask[i / NumEltBytes];
7329 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
7332 if (Zeroable[i / NumEltBytes]) {
7333 PSHUFBMask[i] = ZeroMask;
7340 // PSHUFB can't cross lanes, ensure this doesn't happen.
7341 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
7345 M = M * NumEltBytes + (i % NumEltBytes);
7346 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
7349 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
7350 return DAG.getBitcast(
7351 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1),
7352 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
7355 // X86 has dedicated unpack instructions that can handle specific blend
7356 // operations: UNPCKH and UNPCKL.
7357 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
7358 ArrayRef<int> Mask, SDValue V1,
7359 SDValue V2, SelectionDAG &DAG) {
7360 int NumElts = VT.getVectorNumElements();
7361 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7362 SmallVector<int, 8> Unpckl(NumElts);
7363 SmallVector<int, 8> Unpckh(NumElts);
7365 for (int i = 0; i < NumElts; ++i) {
7366 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7367 int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
7368 int HiPos = LoPos + NumEltsInLane / 2;
7373 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
7374 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
7375 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
7376 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
7378 // Commute and try again.
7379 ShuffleVectorSDNode::commuteMask(Unpckl);
7380 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
7381 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
7383 ShuffleVectorSDNode::commuteMask(Unpckh);
7384 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
7385 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
7390 /// \brief Try to emit a bitmask instruction for a shuffle.
7392 /// This handles cases where we can model a blend exactly as a bitmask due to
7393 /// one of the inputs being zeroable.
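///
/// For example, if V2 is all zeros, the v4i32 mask <0, 4, 2, 4> is lowered to
/// (and V1, <-1, 0, -1, 0>).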
7394 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
7395 SDValue V2, ArrayRef<int> Mask,
7396 SelectionDAG &DAG) {
7397 MVT EltVT = VT.getVectorElementType();
7398 int NumEltBits = EltVT.getSizeInBits();
7399 MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7400 SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
7401 SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
7403 if (EltVT.isFloatingPoint()) {
7404 Zero = DAG.getBitcast(EltVT, Zero);
7405 AllOnes = DAG.getBitcast(EltVT, AllOnes);
7407 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7408 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7410 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7413 if (Mask[i] % Size != i)
7414 return SDValue(); // Not a blend.
7416 V = Mask[i] < Size ? V1 : V2;
7417 else if (V != (Mask[i] < Size ? V1 : V2))
7418 return SDValue(); // Can only let one input through the mask.
7420 VMaskOps[i] = AllOnes;
7423 return SDValue(); // No non-zeroable elements!
7425 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
7426 V = DAG.getNode(VT.isFloatingPoint()
7427 ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
7432 /// \brief Try to emit a blend instruction for a shuffle using bit math.
7434 /// This is used as a fallback approach when first class blend instructions are
7435 /// unavailable. Currently it is only suitable for integer vectors, but could
7436 /// be generalized for floating point vectors if desirable.
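///
/// For example, the v8i16 blend mask <0, 9, 2, 11, 4, 13, 6, 15> becomes
/// (or (and V1, M), (andnp M, V2)) with M == <-1, 0, -1, 0, -1, 0, -1, 0>.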
7437 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
7438 SDValue V2, ArrayRef<int> Mask,
7439 SelectionDAG &DAG) {
7440 assert(VT.isInteger() && "Only supports integer vector types!");
7441 MVT EltVT = VT.getVectorElementType();
7442 int NumEltBits = EltVT.getSizeInBits();
7443 SDValue Zero = DAG.getConstant(0, DL, EltVT);
7444 SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
7446 SmallVector<SDValue, 16> MaskOps;
7447 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7448 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
7449 return SDValue(); // Shuffled input!
7450 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
7453 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
7454 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
7455 // We have to cast V2 around.
7456 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
7457 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
7458 DAG.getBitcast(MaskVT, V1Mask),
7459 DAG.getBitcast(MaskVT, V2)));
7460 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
7463 /// \brief Try to emit a blend instruction for a shuffle.
7465 /// This doesn't do any checks for the availability of instructions for blending
7466 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7467 /// be matched in the backend with the type given. What it does check for is
7468 /// that the shuffle mask is a blend, or convertible into a blend with zero.
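///
/// For example, the v4f32 mask <0, 5, 2, 7> is a blend taking elements 1 and 3
/// from V2, so the resulting BLENDI immediate is 0b1010.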
7469 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
7470 SDValue V2, ArrayRef<int> Original,
7471 const X86Subtarget &Subtarget,
7472 SelectionDAG &DAG) {
7473 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7474 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7475 SmallVector<int, 8> Mask(Original.begin(), Original.end());
7476 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7477 bool ForceV1Zero = false, ForceV2Zero = false;
7479 // Attempt to generate the binary blend mask. If an input is zero then
7480 // we can use any lane.
7481 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
7482 unsigned BlendMask = 0;
7483 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7489 if (M == i + Size) {
7490 BlendMask |= 1u << i;
7501 BlendMask |= 1u << i;
7506 return SDValue(); // Shuffled input!
7509 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
7511 V1 = getZeroVector(VT, Subtarget, DAG, DL);
7513 V2 = getZeroVector(VT, Subtarget, DAG, DL);
7515 auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
7516 unsigned ScaledMask = 0;
7517 for (int i = 0; i != Size; ++i)
7518 if (BlendMask & (1u << i))
7519 for (int j = 0; j != Scale; ++j)
7520 ScaledMask |= 1u << (i * Scale + j);
7524 switch (VT.SimpleTy) {
7529 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7530 DAG.getConstant(BlendMask, DL, MVT::i8));
7534 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
7538 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7539 // that instruction.
7540 if (Subtarget.hasAVX2()) {
7541 // Scale the blend by the number of 32-bit dwords per element.
7542 int Scale = VT.getScalarSizeInBits() / 32;
7543 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
7544 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7545 V1 = DAG.getBitcast(BlendVT, V1);
7546 V2 = DAG.getBitcast(BlendVT, V2);
7547 return DAG.getBitcast(
7548 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7549 DAG.getConstant(BlendMask, DL, MVT::i8)));
7553 // For integer shuffles we need to expand the mask and cast the inputs to
7554 // v8i16s prior to blending.
7555 int Scale = 8 / VT.getVectorNumElements();
7556 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
7557 V1 = DAG.getBitcast(MVT::v8i16, V1);
7558 V2 = DAG.getBitcast(MVT::v8i16, V2);
7559 return DAG.getBitcast(VT,
7560 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7561 DAG.getConstant(BlendMask, DL, MVT::i8)));
7565 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
7566 SmallVector<int, 8> RepeatedMask;
7567 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7568 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7569 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7571 for (int i = 0; i < 8; ++i)
7572 if (RepeatedMask[i] >= 8)
7573 BlendMask |= 1u << i;
7574 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7575 DAG.getConstant(BlendMask, DL, MVT::i8));
7581 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
7582 "256-bit byte-blends require AVX2 support!");
7584 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
7585 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
7588 // Scale the blend by the number of bytes per element.
7589 int Scale = VT.getScalarSizeInBits() / 8;
7591 // This form of blend is always done on bytes. Compute the byte vector type.
7593 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
7595 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7596 // mix of LLVM's code generator and the x86 backend. We tell the code
7597 // generator that boolean values in the elements of an x86 vector register
7598 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7599 // mapping a select to operand #1, and 'false' mapping to operand #2. The
7600 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7601 // of the element (the remaining are ignored) and 0 in that high bit would
7602 // mean operand #1 while 1 in the high bit would mean operand #2. So while
7603 // the LLVM model for boolean values in vector elements gets the relevant
7604 // bit set, it is set backwards and over-constrained relative to x86's actual behavior.
7606 SmallVector<SDValue, 32> VSELECTMask;
7607 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7608 for (int j = 0; j < Scale; ++j)
7609 VSELECTMask.push_back(
7610 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7611 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
7614 V1 = DAG.getBitcast(BlendVT, V1);
7615 V2 = DAG.getBitcast(BlendVT, V2);
7616 return DAG.getBitcast(
7617 VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
7618 DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
7622 llvm_unreachable("Not a supported integer vector type!");
7626 /// \brief Try to lower as a blend of elements from two inputs followed by
7627 /// a single-input permutation.
7629 /// This matches the pattern where we can blend elements from two inputs and
7630 /// then reduce the shuffle to a single-input permutation.
7631 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
7632 SDValue V1, SDValue V2,
7634 SelectionDAG &DAG) {
7635 // We build up the blend mask while checking whether a blend is a viable way
7636 // to reduce the shuffle.
7637 SmallVector<int, 32> BlendMask(Mask.size(), -1);
7638 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
7640 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7644 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
7646 if (BlendMask[Mask[i] % Size] < 0)
7647 BlendMask[Mask[i] % Size] = Mask[i];
7648 else if (BlendMask[Mask[i] % Size] != Mask[i])
7649 return SDValue(); // Can't blend in the needed input!
7651 PermuteMask[i] = Mask[i] % Size;
7654 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7655 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
7658 /// \brief Generic routine to decompose a shuffle and blend into independent
7659 /// blends and permutes.
7661 /// This matches the extremely common pattern for handling combined
7662 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7663 /// operations. It will try to pick the best arrangement of shuffles and blends.
7665 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
7669 SelectionDAG &DAG) {
7670 // Shuffle the input elements into the desired positions in V1 and V2 and
7671 // blend them together.
7672 SmallVector<int, 32> V1Mask(Mask.size(), -1);
7673 SmallVector<int, 32> V2Mask(Mask.size(), -1);
7674 SmallVector<int, 32> BlendMask(Mask.size(), -1);
7675 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7676 if (Mask[i] >= 0 && Mask[i] < Size) {
7677 V1Mask[i] = Mask[i];
7679 } else if (Mask[i] >= Size) {
7680 V2Mask[i] = Mask[i] - Size;
7681 BlendMask[i] = i + Size;
7684 // Try to lower with the simpler initial blend strategy unless one of the
7685 // input shuffles would be a no-op. We prefer to shuffle inputs as the
7686 // shuffle may be able to fold with a load or other benefit. However, when
7687 // we would have to do twice as many shuffles to achieve this, blending
7688 // first is a better strategy.
7689 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
7690 if (SDValue BlendPerm =
7691 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
7694 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7695 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7696 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7699 /// \brief Try to lower a vector shuffle as a byte rotation.
7701 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7702 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7703 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7704 /// try to generically lower a vector shuffle through such a pattern. It
7705 /// does not check for the profitability of lowering either as PALIGNR or
7706 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7707 /// This matches shuffle vectors that look like:
7709 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7711 /// Essentially it concatenates V1 and V2, shifts right by some number of
7712 /// elements, and takes the low elements as the result. Note that while this is
7713 /// specified as a *right shift* because x86 is little-endian, it is a *left
7714 /// rotate* of the vector lanes.
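///
/// For the v8i16 mask above, the detected rotation is 3 elements, so with
/// SSSE3 this becomes a single PALIGNR with a byte immediate of 3 * 2 == 6;
/// without SSSE3 it is emitted as a PSRLDQ/PSLLDQ/POR sequence instead.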
7715 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
7716 SDValue V1, SDValue V2,
7718 const X86Subtarget &Subtarget,
7719 SelectionDAG &DAG) {
7720 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7722 int NumElts = Mask.size();
7723 int NumLanes = VT.getSizeInBits() / 128;
7724 int NumLaneElts = NumElts / NumLanes;
7726 // We need to detect various ways of spelling a rotation:
7727 // [11, 12, 13, 14, 15, 0, 1, 2]
7728 // [-1, 12, 13, 14, -1, -1, 1, -1]
7729 // [-1, -1, -1, -1, -1, -1, 1, 2]
7730 // [ 3, 4, 5, 6, 7, 8, 9, 10]
7731 // [-1, 4, 5, 6, -1, -1, 9, -1]
7732 // [-1, 4, 5, 6, -1, -1, -1, -1]
7735 for (int l = 0; l < NumElts; l += NumLaneElts) {
7736 for (int i = 0; i < NumLaneElts; ++i) {
7737 if (Mask[l + i] < 0)
7740 // Get the mod-Size index and lane correct it.
7741 int LaneIdx = (Mask[l + i] % NumElts) - l;
7742 // Make sure it was in this lane.
7743 if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
7746 // Determine where a rotated vector would have started.
7747 int StartIdx = i - LaneIdx;
7749 // The identity rotation isn't interesting, stop.
7752 // If we found the tail of a vector the rotation must be the missing
7753 // front. If we found the head of a vector, it must be how much of the
7755 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
7758 Rotation = CandidateRotation;
7759 else if (Rotation != CandidateRotation)
7760 // The rotations don't match, so we can't match this mask.
7763 // Compute which value this mask is pointing at.
7764 SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
7766 // Compute which of the two target values this index should be assigned
7767 // to. This reflects whether the high elements are remaining or the low
7768 // elements are remaining.
7769 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7771 // Either set up this value if we've not encountered it before, or check
7772 // that it remains consistent.
7775 else if (TargetV != MaskV)
7776 // This may be a rotation, but it pulls from the inputs in some
7777 // unsupported interleaving.
7782 // Check that we successfully analyzed the mask, and normalize the results.
7783 assert(Rotation != 0 && "Failed to locate a viable rotation!");
7784 assert((Lo || Hi) && "Failed to find a rotated input vector!");
7790 // Cast the inputs to i8 vector of correct length to match PALIGNR or
7792 MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
7793 Lo = DAG.getBitcast(ByteVT, Lo);
7794 Hi = DAG.getBitcast(ByteVT, Hi);
7796 // The actual rotate instruction rotates bytes, so we need to scale the
7797 // rotation based on how many bytes are in the vector lane.
7798 int Scale = 16 / NumLaneElts;
7800 // SSSE3 targets can use the palignr instruction.
7801 if (Subtarget.hasSSSE3()) {
7802 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
7803 "512-bit PALIGNR requires BWI instructions");
7804 return DAG.getBitcast(
7805 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
7806 DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
7809 assert(VT.is128BitVector() &&
7810 "Rotate-based lowering only supports 128-bit lowering!");
7811 assert(Mask.size() <= 16 &&
7812 "Can shuffle at most 16 bytes in a 128-bit vector!");
7813 assert(ByteVT == MVT::v16i8 &&
7814 "SSE2 rotate lowering only needed for v16i8!");
7816 // Default SSE2 implementation
7817 int LoByteShift = 16 - Rotation * Scale;
7818 int HiByteShift = Rotation * Scale;
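// For example, with Rotation = 3 on v8i16 (Scale = 2) this computes
// LoByteShift = 10 and HiByteShift = 6, i.e. (Lo << 10 bytes) | (Hi >> 6 bytes).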
7820 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
7821 DAG.getConstant(LoByteShift, DL, MVT::i8));
7822 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
7823 DAG.getConstant(HiByteShift, DL, MVT::i8));
7824 return DAG.getBitcast(VT,
7825 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
7828 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7830 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
7831 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
7832 /// matches elements from one of the input vectors shuffled to the left or
7833 /// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quadword lane.
///
/// PSLL : (little-endian) left bit shift.
7838 /// [ zz, 0, zz, 2 ]
7839 /// [ -1, 4, zz, -1 ]
7840 /// PSRL : (little-endian) right bit shift.
7842 /// [ -1, -1, 7, zz]
7843 /// PSLLDQ : (little-endian) left byte shift
7844 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
7845 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
7846 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
7847 /// PSRLDQ : (little-endian) right byte shift
7848 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
7849 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
7850 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
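///
/// For example, the v4i32 mask [zz, 0, zz, 2] above is matched at Scale == 2,
/// Shift == 1: the vector is bitcast to v2i64 and each 64-bit element is
/// shifted left by 1 * 32 bits (an X86ISD::VSHLI, i.e. PSLLQ by 32).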
7851 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
7852 SDValue V2, ArrayRef<int> Mask,
7853 const X86Subtarget &Subtarget,
7854 SelectionDAG &DAG) {
7855 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7857 int Size = Mask.size();
7858 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7860 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
7861 for (int i = 0; i < Size; i += Scale)
7862 for (int j = 0; j < Shift; ++j)
if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
  return false;
7869 auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
7870 for (int i = 0; i != Size; i += Scale) {
7871 unsigned Pos = Left ? i + Shift : i;
7872 unsigned Low = Left ? i : i + Shift;
7873 unsigned Len = Scale - Shift;
7874 if (!isSequentialOrUndefInRange(Mask, Pos, Len,
Low + (V == V1 ? 0 : Size)))
  return SDValue();
7879 int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
7880 bool ByteShift = ShiftEltBits > 64;
7881 unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
7882 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
7883 int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
// Normalize the scale for byte shifts to still produce an i64 element type.
7887 Scale = ByteShift ? Scale / 2 : Scale;
7889 // We need to round trip through the appropriate type for the shift.
7890 MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7891 MVT ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8)
7892 : MVT::getVectorVT(ShiftSVT, Size / Scale);
7893 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7894 "Illegal integer vector type");
7895 V = DAG.getBitcast(ShiftVT, V);
7897 V = DAG.getNode(OpCode, DL, ShiftVT, V,
7898 DAG.getConstant(ShiftAmt, DL, MVT::i8));
7899 return DAG.getBitcast(VT, V);
7902 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7903 // keep doubling the size of the integer elements up to that. We can
7904 // then shift the elements of the integer vector by whole multiples of
7905 // their width within the elements of the larger integer vector. Test each
7906 // multiple to see if we can find a match with the moved element indices
7907 // and that the shifted in elements are all zeroable.
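// For a v8i16 shuffle, for instance, the scales tried are 2 (i32 elements),
// 4 (i64 elements) and 8 (whole 128-bit byte shifts), with element shifts of
// 1..Scale-1 at each scale.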
7908 unsigned MaxWidth = (VT.is512BitVector() && !Subtarget.hasBWI() ? 64 : 128);
7909 for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= MaxWidth; Scale *= 2)
7910 for (int Shift = 1; Shift != Scale; ++Shift)
7911 for (bool Left : {true, false})
7912 if (CheckZeros(Shift, Scale, Left))
7913 for (SDValue V : {V1, V2})
7914 if (SDValue Match = MatchShift(Shift, Scale, Left, V))
7921 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
7922 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
7923 SDValue V2, ArrayRef<int> Mask,
7924 SelectionDAG &DAG) {
7925 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7926 assert(!Zeroable.all() && "Fully zeroable shuffle mask");
7928 int Size = Mask.size();
7929 int HalfSize = Size / 2;
7930 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7932 // Upper half must be undefined.
7933 if (!isUndefInRange(Mask, HalfSize, HalfSize))
7936 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
7937 // Remainder of lower half result is zero and upper half is all undef.
7938 auto LowerAsEXTRQ = [&]() {
7939 // Determine the extraction length from the part of the
// lower half that isn't zeroable.
int Len = HalfSize;
for (; Len > 0; --Len)
  if (!Zeroable[Len - 1])
    break;
7945 assert(Len > 0 && "Zeroable shuffle mask");
7947 // Attempt to match first Len sequential elements from the lower half.
7950 for (int i = 0; i != Len; ++i) {
7954 SDValue &V = (M < Size ? V1 : V2);
7957 // The extracted elements must start at a valid index and all mask
7958 // elements must be in the lower half.
7959 if (i > M || M >= HalfSize)
7962 if (Idx < 0 || (Src == V && Idx == (M - i))) {
7973 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
7974 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
7975 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
7976 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
7977 DAG.getConstant(BitLen, DL, MVT::i8),
7978 DAG.getConstant(BitIdx, DL, MVT::i8));
7981 if (SDValue ExtrQ = LowerAsEXTRQ())
7984 // INSERTQ: Extract lowest Len elements from lower half of second source and
7985 // insert over first source, starting at Idx.
7986 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
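// For example, the v8i16 mask [0, 8, 9, -1, -1, -1, -1, -1] matches with
// Idx = 1 and Len = 2 (BitIdx = 16, BitLen = 32): the low two elements of the
// second source are inserted over the first source starting at element 1.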
7987 auto LowerAsInsertQ = [&]() {
7988 for (int Idx = 0; Idx != HalfSize; ++Idx) {
7991 // Attempt to match first source from mask before insertion point.
7992 if (isUndefInRange(Mask, 0, Idx)) {
7994 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
7996 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
8002 // Extend the extraction length looking to match both the insertion of
8003 // the second source and the remaining elements of the first.
8004 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
8009 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
8011 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
8017 // Match the remaining elements of the lower half.
8018 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
8020 } else if ((!Base || (Base == V1)) &&
8021 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
8023 } else if ((!Base || (Base == V2)) &&
8024 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
8031 // We may not have a base (first source) - this can safely be undefined.
8033 Base = DAG.getUNDEF(VT);
8035 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8036 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8037 return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
8038 DAG.getConstant(BitLen, DL, MVT::i8),
8039 DAG.getConstant(BitIdx, DL, MVT::i8));
8046 if (SDValue InsertQ = LowerAsInsertQ())
8052 /// \brief Lower a vector shuffle as a zero or any extension.
8054 /// Given a specific number of elements, element bit width, and extension
8055 /// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and can
/// begin at an offset element index in the input; to avoid excess shuffling,
/// the offset must either be in the bottom lane or at the start of a higher
/// lane. All extended elements must come from the same input.
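///
/// For example, on SSE4.1 a Scale == 2, Offset == 0 zero extension of v8i16 is
/// emitted as a single X86ISD::VZEXT (PMOVZXWD) to v4i32.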
8061 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8062 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
8063 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8064 assert(Scale > 1 && "Need a scale to extend.");
8065 int EltBits = VT.getScalarSizeInBits();
8066 int NumElements = VT.getVectorNumElements();
8067 int NumEltsPerLane = 128 / EltBits;
8068 int OffsetLane = Offset / NumEltsPerLane;
8069 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
8070 "Only 8, 16, and 32 bit elements can be extended.");
8071 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
8072 assert(0 <= Offset && "Extension offset must be positive.");
8073 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
8074 "Extension offset must be in the first lane or start an upper lane.");
// Check that an index is in the same lane as the base offset.
8077 auto SafeOffset = [&](int Idx) {
8078 return OffsetLane == (Idx / NumEltsPerLane);
8081 // Shift along an input so that the offset base moves to the first element.
8082 auto ShuffleOffset = [&](SDValue V) {
8086 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8087 for (int i = 0; i * Scale < NumElements; ++i) {
8088 int SrcIdx = i + Offset;
8089 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
8091 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
8094 // Found a valid zext mask! Try various lowering strategies based on the
8095 // input type and available ISA extensions.
8096 if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
8098 // PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
  return SDValue();
8101 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
8102 NumElements / Scale);
8103 InputV = ShuffleOffset(InputV);
8105 // For 256-bit vectors, we only need the lower (128-bit) input half.
8106 if (VT.is256BitVector())
8107 InputV = extract128BitVector(InputV, 0, DAG, DL);
8109 InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
8110 return DAG.getBitcast(VT, InputV);
8113 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
8115 // For any extends we can cheat for larger element sizes and use shuffle
8116 // instructions that can fold with a load and/or copy.
8117 if (AnyExt && EltBits == 32) {
8118 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
8120 return DAG.getBitcast(
8121 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8122 DAG.getBitcast(MVT::v4i32, InputV),
8123 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
8125 if (AnyExt && EltBits == 16 && Scale > 2) {
8126 int PSHUFDMask[4] = {Offset / 2, -1,
8127 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
8128 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8129 DAG.getBitcast(MVT::v4i32, InputV),
8130 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
8131 int PSHUFWMask[4] = {1, -1, -1, -1};
8132 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
8133 return DAG.getBitcast(
8134 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
8135 DAG.getBitcast(MVT::v8i16, InputV),
8136 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// to 64-bit integers.
8141 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
8142 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
8143 assert(VT.is128BitVector() && "Unexpected vector width!");
8145 int LoIdx = Offset * EltBits;
8146 SDValue Lo = DAG.getBitcast(
8147 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8148 DAG.getConstant(EltBits, DL, MVT::i8),
8149 DAG.getConstant(LoIdx, DL, MVT::i8)));
8151 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
8152 !SafeOffset(Offset + 1))
8153 return DAG.getBitcast(VT, Lo);
8155 int HiIdx = (Offset + 1) * EltBits;
8156 SDValue Hi = DAG.getBitcast(
8157 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8158 DAG.getConstant(EltBits, DL, MVT::i8),
8159 DAG.getConstant(HiIdx, DL, MVT::i8)));
8160 return DAG.getBitcast(VT,
8161 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
8164 // If this would require more than 2 unpack instructions to expand, use
8165 // pshufb when available. We can only use more than 2 unpack instructions
8166 // when zero extending i8 elements which also makes it easier to use pshufb.
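// For illustration, zero-extending the low two bytes of a v16i8 to i64
// elements (Scale = 8, Offset = 0) builds the PSHUFB control
// {0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1, 0x80, ...}; the 0x80
// entries select a zero byte.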
8167 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
8168 assert(NumElements == 16 && "Unexpected byte vector width!");
8169 SDValue PSHUFBMask[16];
8170 for (int i = 0; i < 16; ++i) {
8171 int Idx = Offset + (i / Scale);
8172 PSHUFBMask[i] = DAG.getConstant(
8173 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
8175 InputV = DAG.getBitcast(MVT::v16i8, InputV);
8176 return DAG.getBitcast(
8177 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
8178 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
8181 // If we are extending from an offset, ensure we start on a boundary that
8182 // we can unpack from.
8183 int AlignToUnpack = Offset % (NumElements / Scale);
8184 if (AlignToUnpack) {
8185 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8186 for (int i = AlignToUnpack; i < NumElements; ++i)
8187 ShMask[i - AlignToUnpack] = i;
8188 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
8189 Offset -= AlignToUnpack;
8192 // Otherwise emit a sequence of unpacks.
8194 unsigned UnpackLoHi = X86ISD::UNPCKL;
8195 if (Offset >= (NumElements / 2)) {
8196 UnpackLoHi = X86ISD::UNPCKH;
8197 Offset -= (NumElements / 2);
8200 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
8201 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
8202 : getZeroVector(InputVT, Subtarget, DAG, DL);
8203 InputV = DAG.getBitcast(InputVT, InputV);
8204 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
8208 } while (Scale > 1);
8209 return DAG.getBitcast(VT, InputV);
8212 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8214 /// This routine will try to do everything in its power to cleverly lower
8215 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
8216 /// check for the profitability of this lowering, it tries to aggressively
8217 /// match this pattern. It will use all of the micro-architectural details it
8218 /// can to emit an efficient lowering. It handles both blends with all-zero
8219 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
8220 /// masking out later).
8222 /// The reason we have dedicated lowering for zext-style shuffles is that they
8223 /// are both incredibly common and often quite performance sensitive.
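///
/// Extension factors are tried from widest to narrowest; e.g. for v16i8 the
/// routine first tries extending to 64-bit elements (Scale 8), then 32-bit
/// (Scale 4), then 16-bit (Scale 2).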
8224 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
8225 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8226 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8227 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8229 int Bits = VT.getSizeInBits();
8230 int NumLanes = Bits / 128;
8231 int NumElements = VT.getVectorNumElements();
8232 int NumEltsPerLane = NumElements / NumLanes;
8233 assert(VT.getScalarSizeInBits() <= 32 &&
8234 "Exceeds 32-bit integer zero extension limit");
8235 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
// Define a helper function to check a particular ext-scale and lower to it if
// valid.
8239 auto Lower = [&](int Scale) -> SDValue {
8244 for (int i = 0; i < NumElements; ++i) {
8247 continue; // Valid anywhere but doesn't tell us anything.
8248 if (i % Scale != 0) {
// Each of the extended elements needs to be zeroable.
// We are no longer in the any-extend case.
8258 // Each of the base elements needs to be consecutive indices into the
8259 // same input vector.
8260 SDValue V = M < NumElements ? V1 : V2;
8261 M = M % NumElements;
8264 Offset = M - (i / Scale);
8265 } else if (InputV != V)
8266 return SDValue(); // Flip-flopping inputs.
// Offset must start in the lowest 128-bit lane or at the start of an
// upper lane.
8270 // FIXME: Is it ever worth allowing a negative base offset?
8271 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
8272 (Offset % NumEltsPerLane) == 0))
// If we are offsetting, all referenced entries must come from the same
// 128-bit lane.
8277 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
8280 if ((M % NumElements) != (Offset + (i / Scale)))
8281 return SDValue(); // Non-consecutive strided elements.
8285 // If we fail to find an input, we have a zero-shuffle which should always
8286 // have already been handled.
8287 // FIXME: Maybe handle this here in case during blending we end up with one?
8291 // If we are offsetting, don't extend if we only match a single input, we
8292 // can always do better by using a basic PSHUF or PUNPCK.
8293 if (Offset != 0 && Matches < 2)
8296 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8297 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
8300 // The widest scale possible for extending is to a 64-bit integer.
8301 assert(Bits % 64 == 0 &&
8302 "The number of bits in a vector must be divisible by 64 on x86!");
8303 int NumExtElements = Bits / 64;
// Each iteration, try extending the elements half as much, but into twice as
// many elements.
8307 for (; NumExtElements < NumElements; NumExtElements *= 2) {
8308 assert(NumElements % NumExtElements == 0 &&
8309 "The input vector size must be divisible by the extended size.");
8310 if (SDValue V = Lower(NumElements / NumExtElements))
8314 // General extends failed, but 128-bit vectors may be able to use MOVQ.
8318 // Returns one of the source operands if the shuffle can be reduced to a
8319 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
8320 auto CanZExtLowHalf = [&]() {
8321 for (int i = NumElements / 2; i != NumElements; ++i)
8324 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
8326 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
8331 if (SDValue V = CanZExtLowHalf()) {
8332 V = DAG.getBitcast(MVT::v2i64, V);
8333 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
8334 return DAG.getBitcast(VT, V);
8337 // No viable ext lowering found.
8341 /// \brief Try to get a scalar value for a specific element of a vector.
8343 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
8344 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
8345 SelectionDAG &DAG) {
8346 MVT VT = V.getSimpleValueType();
8347 MVT EltVT = VT.getVectorElementType();
8348 V = peekThroughBitcasts(V);
// If the bitcasts shift the element size, we can't extract an equivalent
// element from it.
8352 MVT NewVT = V.getSimpleValueType();
8353 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8356 if (V.getOpcode() == ISD::BUILD_VECTOR ||
8357 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
8358 // Ensure the scalar operand is the same size as the destination.
8359 // FIXME: Add support for scalar truncation where possible.
8360 SDValue S = V.getOperand(Idx);
8361 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
8362 return DAG.getBitcast(EltVT, S);
8368 /// \brief Helper to test for a load that can be folded with x86 shuffles.
8370 /// This is particularly important because the set of instructions varies
8371 /// significantly based on whether the operand is a load or not.
8372 static bool isShuffleFoldableLoad(SDValue V) {
8373 V = peekThroughBitcasts(V);
8374 return ISD::isNON_EXTLoad(V.getNode());
8377 /// \brief Try to lower insertion of a single element into a zero vector.
/// This is a common pattern for which we have especially efficient lowerings
/// across all subtarget feature sets.
8381 static SDValue lowerVectorShuffleAsElementInsertion(
8382 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8383 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8384 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8386 MVT EltVT = VT.getVectorElementType();
8388 int V2Index = std::find_if(Mask.begin(), Mask.end(),
[&Mask](int M) { return M >= (int)Mask.size(); }) -
Mask.begin();
8391 bool IsV1Zeroable = true;
8392 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8393 if (i != V2Index && !Zeroable[i]) {
8394 IsV1Zeroable = false;
8398 // Check for a single input from a SCALAR_TO_VECTOR node.
8399 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8400 // all the smarts here sunk into that routine. However, the current
8401 // lowering of BUILD_VECTOR makes that nearly impossible until the old
8402 // vector shuffle lowering is dead.
8403 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
8405 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
8406 // We need to zext the scalar if it is smaller than an i32.
8407 V2S = DAG.getBitcast(EltVT, V2S);
8408 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
// Using zext to expand a narrow element won't work for non-zero
// insertions.
8414 // Zero-extend directly to i32.
8416 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8418 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8419 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8420 EltVT == MVT::i16) {
8421 // Either not inserting from the low element of the input or the input
8422 // element size is too small to use VZEXT_MOVL to clear the high bits.
8426 if (!IsV1Zeroable) {
8427 // If V1 can't be treated as a zero vector we have fewer options to lower
8428 // this. We can't support integer vectors or non-zero targets cheaply, and
8429 // the V1 elements can't be permuted in any way.
8430 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8431 if (!VT.isFloatingPoint() || V2Index != 0)
8433 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8434 V1Mask[V2Index] = -1;
8435 if (!isNoopShuffleMask(V1Mask))
8437 // This is essentially a special case blend operation, but if we have
8438 // general purpose blend operations, they are always faster. Bail and let
8439 // the rest of the lowering handle these as blends.
8440 if (Subtarget.hasSSE41())
8443 // Otherwise, use MOVSD or MOVSS.
8444 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8445 "Only two types of floating point element types to handle!");
8446 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8450 // This lowering only works for the low element with floating point vectors.
8451 if (VT.isFloatingPoint() && V2Index != 0)
8454 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8456 V2 = DAG.getBitcast(VT, V2);
8459 // If we have 4 or fewer lanes we can cheaply shuffle the element into
8460 // the desired position. Otherwise it is more efficient to do a vector
8461 // shift left. We know that we can do a vector shift left because all
8462 // the inputs are zero.
8463 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8464 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8465 V2Shuffle[V2Index] = 0;
8466 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8468 V2 = DAG.getBitcast(MVT::v16i8, V2);
V2 = DAG.getNode(
    X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
8471 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
8472 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
8473 DAG.getDataLayout(), VT)));
8474 V2 = DAG.getBitcast(VT, V2);
/// Try to lower a broadcast of a single (truncated) integer element coming
/// from a scalar_to_vector/build_vector node \p V0 with larger elements.
8483 /// This assumes we have AVX2.
8484 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
8485 SDValue V0, int BroadcastIdx,
8486 const X86Subtarget &Subtarget,
8487 SelectionDAG &DAG) {
8488 assert(Subtarget.hasAVX2() &&
8489 "We can only lower integer broadcasts with AVX2!");
8491 EVT EltVT = VT.getVectorElementType();
8492 EVT V0VT = V0.getValueType();
8494 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
8495 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
8497 EVT V0EltVT = V0VT.getVectorElementType();
8498 if (!V0EltVT.isInteger())
8501 const unsigned EltSize = EltVT.getSizeInBits();
8502 const unsigned V0EltSize = V0EltVT.getSizeInBits();
8504 // This is only a truncation if the original element type is larger.
8505 if (V0EltSize <= EltSize)
8508 assert(((V0EltSize % EltSize) == 0) &&
8509 "Scalar type sizes must all be powers of 2 on x86!");
8511 const unsigned V0Opc = V0.getOpcode();
8512 const unsigned Scale = V0EltSize / EltSize;
8513 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
8515 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
8516 V0Opc != ISD::BUILD_VECTOR)
8519 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
8521 // If we're extracting non-least-significant bits, shift so we can truncate.
8522 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
8523 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
8524 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
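// For example, broadcasting i16 element 5 out of a v4i32 build_vector uses
// build_vector operand 5 / 2 == 2 and shifts it right by 1 * 16 bits before
// the truncate.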
8525 if (const int OffsetIdx = BroadcastIdx % Scale)
8526 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
8527 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
8529 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
8530 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
8533 /// \brief Try to lower broadcast of a single element.
8535 /// For convenience, this code also bundles all of the subtarget feature set
8536 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8537 /// a convenient way to factor it out.
8538 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
8539 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
8540 SDValue V1, SDValue V2,
8542 const X86Subtarget &Subtarget,
8543 SelectionDAG &DAG) {
8544 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
8545 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
8546 (Subtarget.hasAVX2() && VT.isInteger())))
8549 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
8550 // we can only broadcast from a register with AVX2.
8551 unsigned NumElts = Mask.size();
8552 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
8553 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
8555 // Check that the mask is a broadcast.
8556 int BroadcastIdx = -1;
8557 for (int i = 0; i != (int)NumElts; ++i) {
8558 SmallVector<int, 8> BroadcastMask(NumElts, i);
8559 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
8565 if (BroadcastIdx < 0)
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                          "a sorted mask where the broadcast "
                                          "comes from V1.");
8571 // Go up the chain of (vector) values to find a scalar load that we can
8572 // combine with the broadcast.
8575 switch (V.getOpcode()) {
8576 case ISD::BITCAST: {
8577 SDValue VSrc = V.getOperand(0);
8578 MVT SrcVT = VSrc.getSimpleValueType();
8579 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
8584 case ISD::CONCAT_VECTORS: {
8585 int OperandSize = Mask.size() / V.getNumOperands();
8586 V = V.getOperand(BroadcastIdx / OperandSize);
8587 BroadcastIdx %= OperandSize;
8590 case ISD::INSERT_SUBVECTOR: {
8591 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8592 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8596 int BeginIdx = (int)ConstantIdx->getZExtValue();
int EndIdx =
    BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
8599 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8600 BroadcastIdx -= BeginIdx;
8611 // Check if this is a broadcast of a scalar. We special case lowering
8612 // for scalars so that we can more effectively fold with loads.
8613 // First, look through bitcast: if the original value has a larger element
8614 // type than the shuffle, the broadcast element is in essence truncated.
8615 // Make that explicit to ease folding.
8616 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
8617 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
8618 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
8619 return TruncBroadcast;
8621 MVT BroadcastVT = VT;
8623 // Peek through any bitcast (only useful for loads).
8624 SDValue BC = peekThroughBitcasts(V);
8626 // Also check the simpler case, where we can directly reuse the scalar.
8627 if (V.getOpcode() == ISD::BUILD_VECTOR ||
8628 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8629 V = V.getOperand(BroadcastIdx);
8631 // If we can't broadcast from a register, check that the input is a load.
8632 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
8634 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
8635 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
8636 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
8637 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
8638 Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
8641 // If we are broadcasting a load that is only used by the shuffle
8642 // then we can reduce the vector load to the broadcasted scalar load.
8643 LoadSDNode *Ld = cast<LoadSDNode>(BC);
8644 SDValue BaseAddr = Ld->getOperand(1);
8645 EVT SVT = BroadcastVT.getScalarType();
8646 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
8647 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
8648 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
8649 DAG.getMachineFunction().getMachineMemOperand(
8650 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
8651 } else if (!BroadcastFromReg) {
8652 // We can't broadcast from a vector register.
8654 } else if (BroadcastIdx != 0) {
8655 // We can only broadcast from the zero-element of a vector register,
// but it can be advantageous to broadcast from the zero-element of a
// subvector.
8658 if (!VT.is256BitVector() && !VT.is512BitVector())
8661 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
8662 if (VT == MVT::v4f64 || VT == MVT::v4i64)
8665 // Only broadcast the zero-element of a 128-bit subvector.
8666 unsigned EltSize = VT.getScalarSizeInBits();
8667 if (((BroadcastIdx * EltSize) % 128) != 0)
8670 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
8671 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
8672 DAG.getIntPtrConstant(BroadcastIdx, DL));
8675 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
8676 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
8677 DAG.getBitcast(MVT::f64, V));
8679 // Bitcast back to the same scalar type as BroadcastVT.
8680 MVT SrcVT = V.getSimpleValueType();
8681 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
8682 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
8683 "Unexpected vector element size");
8684 if (SrcVT.isVector()) {
8685 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8686 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
8688 SrcVT = BroadcastVT.getScalarType();
8690 V = DAG.getBitcast(SrcVT, V);
8693 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
8696 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8697 // INSERTPS when the V1 elements are already in the correct locations
8698 // because otherwise we can just always use two SHUFPS instructions which
8699 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8700 // perform INSERTPS if a single V1 element is out of place and all V2
8701 // elements are zeroable.
8702 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
8703 unsigned &InsertPSMask,
8704 const SmallBitVector &Zeroable,
8706 SelectionDAG &DAG) {
8707 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
8708 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
8709 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8711 int V1DstIndex = -1;
8712 int V2DstIndex = -1;
8713 bool V1UsedInPlace = false;
8715 for (int i = 0; i < 4; ++i) {
8716 // Synthesize a zero mask from the zeroable elements (includes undefs).
8722 // Flag if we use any V1 inputs in place.
8724 V1UsedInPlace = true;
8728 // We can only insert a single non-zeroable element.
8729 if (V1DstIndex >= 0 || V2DstIndex >= 0)
8733 // V1 input out of place for insertion.
8736 // V2 input for insertion.
8741 // Don't bother if we have no (non-zeroable) element for insertion.
8742 if (V1DstIndex < 0 && V2DstIndex < 0)
8745 // Determine element insertion src/dst indices. The src index is from the
8746 // start of the inserted vector, not the start of the concatenated vector.
8747 unsigned V2SrcIndex = 0;
8748 if (V1DstIndex >= 0) {
8749 // If we have a V1 input out of place, we use V1 as the V2 element insertion
8750 // and don't use the original V2 at all.
8751 V2SrcIndex = Mask[V1DstIndex];
8752 V2DstIndex = V1DstIndex;
8755 V2SrcIndex = Mask[V2DstIndex] - 4;
8758 // If no V1 inputs are used in place, then the result is created only from
8759 // the zero mask and the V2 insertion - so remove V1 dependency.
8761 V1 = DAG.getUNDEF(MVT::v4f32);
8763 // Insert the V2 element into the desired position.
8764 InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
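// The immediate thus encodes [src elt:2][dst elt:2][zero mask:4]; for example,
// inserting V2 element 1 into result element 3 while zeroing element 0 gives
// (1 << 6) | (3 << 4) | 0x1 == 0x71.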
8765 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8769 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
8770 SDValue V2, ArrayRef<int> Mask,
8771 SelectionDAG &DAG) {
8772 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8773 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8774 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8776 // Attempt to match the insertps pattern.
8777 unsigned InsertPSMask;
8778 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
8781 // Insert the V2 element into the desired position.
8782 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8783 DAG.getConstant(InsertPSMask, DL, MVT::i8));
8786 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
8787 /// UNPCK instruction.
8789 /// This specifically targets cases where we end up with alternating between
8790 /// the two inputs, and so can permute them into something that feeds a single
8791 /// UNPCK instruction. Note that this routine only targets integer vectors
8792 /// because for floating point vectors we have a generalized SHUFPS lowering
8793 /// strategy that handles everything that doesn't *exactly* match an unpack,
8794 /// making this clever lowering unnecessary.
8795 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
8796 SDValue V1, SDValue V2,
8798 SelectionDAG &DAG) {
8799 assert(!VT.isFloatingPoint() &&
8800 "This routine only supports integer vectors.");
8801 assert(VT.is128BitVector() &&
8802 "This routine only works on 128-bit vectors.");
8803 assert(!V2.isUndef() &&
8804 "This routine should only be used when blending two inputs.");
8805 assert(Mask.size() >= 2 && "Single element masks are invalid.");
8807 int Size = Mask.size();
int NumLoInputs =
    count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
int NumHiInputs =
    count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
8814 bool UnpackLo = NumLoInputs >= NumHiInputs;
8816 auto TryUnpack = [&](int ScalarSize, int Scale) {
8817 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
8818 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
8820 for (int i = 0; i < Size; ++i) {
8824 // Each element of the unpack contains Scale elements from this mask.
8825 int UnpackIdx = i / Scale;
8827 // We only handle the case where V1 feeds the first slots of the unpack.
8828 // We rely on canonicalization to ensure this is the case.
8829 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
8832 // Setup the mask for this input. The indexing is tricky as we have to
8833 // handle the unpack stride.
8834 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
8835 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
8839 // If we will have to shuffle both inputs to use the unpack, check whether
8840 // we can just unpack first and shuffle the result. If so, skip this unpack.
8841 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
8842 !isNoopShuffleMask(V2Mask))
8845 // Shuffle the inputs into place.
8846 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8847 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8849 // Cast the inputs to the type we will use to unpack them.
8850 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
8851 V1 = DAG.getBitcast(UnpackVT, V1);
8852 V2 = DAG.getBitcast(UnpackVT, V2);
8854 // Unpack the inputs and cast the result back to the desired type.
8855 return DAG.getBitcast(
8856 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
8860 // We try each unpack from the largest to the smallest to try and find one
8861 // that fits this mask.
8862 int OrigScalarSize = VT.getScalarSizeInBits();
8863 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
8864 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
// If none of the unpack-rooted lowerings worked (or were profitable) try an
// initial unpack.
8869 if (NumLoInputs == 0 || NumHiInputs == 0) {
8870 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
8871 "We have to have *some* inputs!");
8872 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
8874 // FIXME: We could consider the total complexity of the permute of each
8875 // possible unpacking. Or at the least we should consider how many
8876 // half-crossings are created.
8877 // FIXME: We could consider commuting the unpacks.
8879 SmallVector<int, 32> PermMask((unsigned)Size, -1);
8880 for (int i = 0; i < Size; ++i) {
8884 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
8887 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
8889 return DAG.getVectorShuffle(
8890 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
8892 DAG.getUNDEF(VT), PermMask);
8898 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8900 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8901 /// support for floating point shuffles but not integer shuffles. These
8902 /// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
8905 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
8906 SDValue V1, SDValue V2,
8907 const X86Subtarget &Subtarget,
8908 SelectionDAG &DAG) {
8909 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8910 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8911 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8914 // Check for being able to broadcast a single element.
8915 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
8916 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
8919 // Straight shuffle of a single input vector. Simulate this by using the
// single input as both of the "inputs" to this instruction.
8921 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
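// For example, the single-input mask <1, 0> gives SHUFPDMask == 1: element 1
// goes to the low result element and element 0 to the high one.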
8923 if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPD which will allow folding a load
8925 // into the shuffle.
8926 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8927 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8930 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
8931 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8933 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8934 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8936 // If we have a single input, insert that into V1 if we can do so cheaply.
8937 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8938 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8939 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
8941 // Try inverting the insertion since for v2 masks it is easy to do and we
8942 // can't reliably sort the mask one way or the other.
8943 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8944 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8945 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8946 DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
8950 // Try to use one of the special instruction patterns to handle two common
8951 // blend patterns if a zero-blend above didn't work.
8952 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
8953 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
8954 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8955 // We can either use a special instruction to load over the low double or
8956 // to move just the low double.
8958 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8960 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8962 if (Subtarget.hasSSE41())
8963 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8967 // Use dedicated unpack instructions for masks that match their pattern.
8969 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
8972 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8973 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
8974 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8977 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8979 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8980 /// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
8983 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
8984 SDValue V1, SDValue V2,
8985 const X86Subtarget &Subtarget,
8986 SelectionDAG &DAG) {
8987 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8988 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8989 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8992 // Check for being able to broadcast a single element.
8993 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
8994 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8997 // Straight shuffle of a single input vector. For everything from SSE2
8998 // onward this has a single fast instruction with no scary immediates.
8999 // We have to map the mask as it is actually a v4i32 shuffle instruction.
9000 V1 = DAG.getBitcast(MVT::v4i32, V1);
9001 int WidenedMask[4] = {
9002 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
9003 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
9004 return DAG.getBitcast(
9006 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9007 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
9009 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
9010 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
9011 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
9012 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
9014 // If we have a blend of two same-type PACKUS operations and the blend aligns
9015 // with the low and high halves, we can just merge the PACKUS operations.
9016 // This is particularly important as it lets us merge shuffles that this
9017 // routine itself creates.
9018 auto GetPackNode = [](SDValue V) {
9019 V = peekThroughBitcasts(V);
9020 return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
9022 if (SDValue V1Pack = GetPackNode(V1))
9023 if (SDValue V2Pack = GetPackNode(V2)) {
9024 EVT PackVT = V1Pack.getValueType();
9025 if (PackVT == V2Pack.getValueType())
9026 return DAG.getBitcast(MVT::v2i64,
9027 DAG.getNode(X86ISD::PACKUS, DL, PackVT,
9028 Mask[0] == 0 ? V1Pack.getOperand(0)
9029 : V1Pack.getOperand(1),
9030 Mask[1] == 2 ? V2Pack.getOperand(0)
9031 : V2Pack.getOperand(1)));
9034 // Try to use shift instructions.
9035 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
9039 // When loading a scalar and then shuffling it into a vector we can often do
9040 // the insertion cheaply.
9041 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9042 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9044 // Try inverting the insertion since for v2 masks it is easy to do and we
9045 // can't reliably sort the mask one way or the other.
9046 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
9047 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9048 DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
9051 // We have different paths for blend lowering, but they all must use the
9052 // *exact* same predicate.
9053 bool IsBlendSupported = Subtarget.hasSSE41();
9054 if (IsBlendSupported)
9055 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
9059 // Use dedicated unpack instructions for masks that match their pattern.
9061 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
9064 // Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
9066 if (Subtarget.hasSSSE3())
9067 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9068 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9071 // If we have direct support for blends, we should lower by decomposing into
9072 // a permute. That will be faster than the domain cross.
9073 if (IsBlendSupported)
9074 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
9077 // We implement this with SHUFPD which is pretty lame because it will likely
9078 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
9079 // However, all the alternatives are still more cycles and newer chips don't
9080 // have this problem. It would be really nice if x86 had better shuffles here.
9081 V1 = DAG.getBitcast(MVT::v2f64, V1);
9082 V2 = DAG.getBitcast(MVT::v2f64, V2);
9083 return DAG.getBitcast(MVT::v2i64,
9084 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
9087 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
9089 /// This is used to disable more specialized lowerings when the shufps lowering
9090 /// will happen to be efficient.
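///
/// For example, [0, 1, 4, 5] needs only one SHUFPS (the low half comes from V1
/// and the high half from V2), whereas [0, 4, 1, 5] mixes both inputs within
/// the low half and cannot be done in one.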
9091 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
9092 // This routine only handles 128-bit shufps.
9093 assert(Mask.size() == 4 && "Unsupported mask size!");
9094 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
9095 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
9096 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
9097 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
9099 // To lower with a single SHUFPS we need to have the low half and high half
9100 // each requiring a single input.
9101 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
9103 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
9109 /// \brief Lower a vector shuffle using the SHUFPS instruction.
9111 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering; it simply
/// uses it.
9114 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
9115 ArrayRef<int> Mask, SDValue V1,
9116 SDValue V2, SelectionDAG &DAG) {
9117 SDValue LowV = V1, HighV = V2;
9118 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
9120 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9122 if (NumV2Elements == 1) {
int V2Index =
    std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
    Mask.begin();
// Compute the index adjacent to V2Index and in the same half by toggling
// the low bit.
9129 int V2AdjIndex = V2Index ^ 1;
9131 if (Mask[V2AdjIndex] < 0) {
9132 // Handles all the cases where we have a single V2 element and an undef.
9133 // This will only ever happen in the high lanes because we commute the
9134 // vector otherwise.
9136 std::swap(LowV, HighV);
9137 NewMask[V2Index] -= 4;
9139 // Handle the case where the V2 element ends up adjacent to a V1 element.
9140 // To make this work, blend them together as the first step.
9141 int V1Index = V2AdjIndex;
9142 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
9143 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
9144 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
9146 // Now proceed to reconstruct the final blend as we have the necessary
9147 // high or low half formed.
9154 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
9155 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
9157 } else if (NumV2Elements == 2) {
9158 if (Mask[0] < 4 && Mask[1] < 4) {
// Handle the easy case where we have V1 in the low lanes and V2 in the
// high lanes.
9163 } else if (Mask[2] < 4 && Mask[3] < 4) {
9164 // We also handle the reversed case because this utility may get called
9165 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
9166 // arrange things in the right direction.
9172 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
9173 // trying to place elements directly, just blend them and set up the final
9174 // shuffle to place them.
// The first two blend mask elements are for V1, the second two are for
// V2.
9178 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
9179 Mask[2] < 4 ? Mask[2] : Mask[3],
9180 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
9181 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
9182 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
9183 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
// Now we do a normal shuffle of V1 by giving V1 as both operands to
// the shuffle.
9188 NewMask[0] = Mask[0] < 4 ? 0 : 2;
9189 NewMask[1] = Mask[0] < 4 ? 2 : 0;
9190 NewMask[2] = Mask[2] < 4 ? 1 : 3;
9191 NewMask[3] = Mask[2] < 4 ? 3 : 1;
9194 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
9195 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
9198 /// \brief Lower 4-lane 32-bit floating point shuffles.
9200 /// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
9203 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9204 SDValue V1, SDValue V2,
9205 const X86Subtarget &Subtarget,
9206 SelectionDAG &DAG) {
9207 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9208 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9209 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9211 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9213 if (NumV2Elements == 0) {
9214 // Check for being able to broadcast a single element.
9215 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9216 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
9219 // Use even/odd duplicate instructions for masks that match their pattern.
9220 if (Subtarget.hasSSE3()) {
9221 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
9222 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
9223 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
9224 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
9227 if (Subtarget.hasAVX()) {
9228 // If we have AVX, we can use VPERMILPS which will allow folding a load
9229 // into the shuffle.
9230 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
9231 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9234 // Otherwise, use a straight shuffle of a single input vector. We pass the
9235 // input vector to both operands to simulate this with a SHUFPS.
9236 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
9237 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9240 // There are special ways we can lower some single-element blends. However, we
9241 // have custom ways we can lower more complex single-element blends below that
9242 // we defer to if both this and BLENDPS fail to match, so restrict this to
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
9245 if (NumV2Elements == 1 && Mask[0] >= 4)
9246 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
9247 Mask, Subtarget, DAG))
9250 if (Subtarget.hasSSE41()) {
9251 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
9255 // Use INSERTPS if we can complete the shuffle efficiently.
9256 if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, DAG))
9259 if (!isSingleSHUFPSMask(Mask))
9260 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
9261 DL, MVT::v4f32, V1, V2, Mask, DAG))
9265 // Use low/high mov instructions.
9266 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
9267 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
9268 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
9269 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
9271 // Use dedicated unpack instructions for masks that match their pattern.
9273 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
9276 // Otherwise fall back to a SHUFPS lowering strategy.
9277 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
9280 /// \brief Lower 4-lane i32 vector shuffles.
9282 /// We try to handle these with integer-domain shuffles where we can, but for
9283 /// blends we use the floating point domain blend instructions.
9284 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9285 SDValue V1, SDValue V2,
9286 const X86Subtarget &Subtarget,
9287 SelectionDAG &DAG) {
9288 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
9289 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
9290 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9292 // Whenever we can lower this as a zext, that instruction is strictly faster
9293 // than any alternative. It also allows us to fold memory operands into the
9294 // shuffle in many cases.
9295 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
9296 Mask, Subtarget, DAG))
9299 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9301 if (NumV2Elements == 0) {
9302 // Check for being able to broadcast a single element.
9303 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9304 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
9307 // Straight shuffle of a single input vector. For everything from SSE2
9308 // onward this has a single fast instruction with no scary immediates.
9309 // We coerce the shuffle pattern to be compatible with UNPCK instructions
9310 // but we aren't actually going to use the UNPCK instruction because doing
9311 // so prevents folding a load into this instruction or making a copy.
9312 const int UnpackLoMask[] = {0, 0, 1, 1};
9313 const int UnpackHiMask[] = {2, 2, 3, 3};
9314 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
9315 Mask = UnpackLoMask;
9316 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
9317 Mask = UnpackHiMask;
9319 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9320 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9323 // Try to use shift instructions.
9324 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
9328 // There are special ways we can lower some single-element blends.
9329 if (NumV2Elements == 1)
9330 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
9331 Mask, Subtarget, DAG))
9334 // We have different paths for blend lowering, but they all must use the
9335 // *exact* same predicate.
9336 bool IsBlendSupported = Subtarget.hasSSE41();
9337 if (IsBlendSupported)
9338 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
9342 if (SDValue Masked =
9343 lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
9346 // Use dedicated unpack instructions for masks that match their pattern.
9348 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
9351 // Try to use byte rotation instructions.
9352 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
9353 if (Subtarget.hasSSSE3())
9354 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9355 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
9358 // If we have direct support for blends, we should lower by decomposing into
9359 // a permute. That will be faster than the domain cross.
9360 if (IsBlendSupported)
9361 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
9364 // Try to lower by permuting the inputs into an unpack instruction.
9365 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
9369 // We implement this with SHUFPS because it can blend from two vectors.
9370 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
9371 // up the inputs, bypassing domain shift penalties that we would incur if we
9372 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
9374 return DAG.getBitcast(
9376 DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
9377 DAG.getBitcast(MVT::v4f32, V2), Mask));
9380 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
9381 /// shuffle lowering, and the most complex part.
9383 /// The lowering strategy is to try to form pairs of input lanes which are
9384 /// targeted at the same half of the final vector, and then use a dword shuffle
9385 /// to place them onto the right half, and finally unpack the paired lanes into
9386 /// their final position.
9388 /// The exact breakdown of how to form these dword pairs and align them on the
9389 /// correct sides is really tricky. See the comments within the function for
9390 /// more of the details.
9392 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
9393 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
9394 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
9395 /// vector, form the analogous 128-bit 8-element Mask.
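///
/// As a rough illustration: the single-input mask <0, 1, 4, 5, 2, 3, 6, 7>
/// pairs up words {0,1}, {4,5}, {2,3} and {6,7}, so a single dword shuffle
/// (PSHUFD <0,2,1,3> on the v4i32 view) already places every pair and both
/// word-level half shuffles degenerate into no-ops.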
9396 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
9397 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
9398 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9399 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
9400 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
9402 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
9403 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
9404 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
9406 SmallVector<int, 4> LoInputs;
9407 std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
9408 [](int M) { return M >= 0; });
9409 std::sort(LoInputs.begin(), LoInputs.end());
9410 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
9411 SmallVector<int, 4> HiInputs;
9412 std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
9413 [](int M) { return M >= 0; });
9414 std::sort(HiInputs.begin(), HiInputs.end());
9415 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
9417 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
9418 int NumHToL = LoInputs.size() - NumLToL;
9420 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
9421 int NumHToH = HiInputs.size() - NumLToH;
9422 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
9423 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
9424 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
9425 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
9427 // If we are splatting two values from one half - one to each half, then
9428 // we can shuffle that half so each is splatted to a dword, then splat those
9429 // to their respective halves.
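  // As a rough illustration: the single-input mask <0,0,0,0,3,3,3,3> takes this
  // path; PSHUFLW <0,0,3,3> first forms [a,a,d,d] in the low half, and then
  // PSHUFD <0,0,1,1> splats those two dwords to their respective halves,
  // yielding [a,a,a,a,d,d,d,d].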
9430 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
9432 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
9433 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
9434 V = DAG.getNode(ShufWOp, DL, VT, V,
9435 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
9436 V = DAG.getBitcast(PSHUFDVT, V);
9437 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
9438 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9439 return DAG.getBitcast(VT, V);
9442 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
9443 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
9444 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
9445 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
9447 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
9448 // such inputs we can swap two of the dwords across the half mark and end up
9449 // with <=2 inputs to each half in each half. Once there, we can fall through
9450 // to the generic code below. For example:
9452 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
9453 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
9455 // However, in some very rare cases we have a 1-into-3 or 3-into-1 on one half
9456 // and an existing 2-into-2 on the other half. In this case we may have to
9457 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
9458 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
9459 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
9460 // because any other situation (including a 3-into-1 or 1-into-3 in the other
9461 // half than the one we target for fixing) will be fixed when we re-enter this
9462 // path. We will also combine any resulting sequence of PSHUFD instructions
9463 // into a single instruction. Here is an example of the tricky case:
9465 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
9466 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
9468 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
9470 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
9471 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
9473 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
9474 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
9476 // The result is fine to be handled by the generic logic.
9477 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
9478 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
9479 int AOffset, int BOffset) {
9480 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
9481 "Must call this with A having 3 or 1 inputs from the A half.");
9482 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
9483 "Must call this with B having 1 or 3 inputs from the B half.");
9484 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
9485 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
9487 bool ThreeAInputs = AToAInputs.size() == 3;
9489 // Compute the index of dword with only one word among the three inputs in
9490 // a half by taking the sum of the half with three inputs and subtracting
9491 // the sum of the actual three inputs. The difference is the remaining slot.
9494 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
9495 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
9496 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
9497 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
9498 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
9499 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
9500 int TripleNonInputIdx =
9501 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
9502 TripleDWord = TripleNonInputIdx / 2;
9504 // We use xor with one to compute the adjacent DWord to whichever one the single input is in.
9506 OneInputDWord = (OneInput / 2) ^ 1;
9508 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
9509 // and BToA inputs. If there is also such a problem with the BToB and AToB
9510 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
9511 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
9512 // is essential that we don't *create* a 3<-1 as then we might oscillate.
9513 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
9514 // Compute how many inputs will be flipped by swapping these DWords. We need
9516 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
9518 int NumFlippedAToBInputs =
9519 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
9520 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
9521 int NumFlippedBToBInputs =
9522 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
9523 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
9524 if ((NumFlippedAToBInputs == 1 &&
9525 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
9526 (NumFlippedBToBInputs == 1 &&
9527 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
9528 // We choose whether to fix the A half or B half based on whether that
9529 // half has zero flipped inputs. At zero, we may not be able to fix it
9530 // with that half. We also bias towards fixing the B half because that
9531 // will more commonly be the high half, and we have to bias one way.
9532 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
9533 ArrayRef<int> Inputs) {
9534 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
9535 bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
9536 PinnedIdx ^ 1) != Inputs.end();
9537 // Determine whether the free index is in the flipped dword or the
9538 // unflipped dword based on where the pinned index is. We use this bit
9539 // in an xor to conditionally select the adjacent dword.
9540 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
9541 bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
9542 FixFreeIdx) != Inputs.end();
9543 if (IsFixIdxInput == IsFixFreeIdxInput)
9545 IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
9546 FixFreeIdx) != Inputs.end();
9547 assert(IsFixIdxInput != IsFixFreeIdxInput &&
9548 "We need to be changing the number of flipped inputs!");
9549 int PSHUFHalfMask[] = {0, 1, 2, 3};
9550 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
9551 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
9553 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
9556 if (M >= 0 && M == FixIdx)
9558 else if (M >= 0 && M == FixFreeIdx)
9561 if (NumFlippedBToBInputs != 0) {
9563 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9564 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
9566 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
9567 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
9568 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
9573 int PSHUFDMask[] = {0, 1, 2, 3};
9574 PSHUFDMask[ADWord] = BDWord;
9575 PSHUFDMask[BDWord] = ADWord;
9578 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
9579 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9581 // Adjust the mask to match the new locations of A and B.
9583 if (M >= 0 && M/2 == ADWord)
9584 M = 2 * BDWord + M % 2;
9585 else if (M >= 0 && M/2 == BDWord)
9586 M = 2 * ADWord + M % 2;
9588 // Recurse back into this routine to re-compute state now that this isn't
9589 // a 3 and 1 problem.
9590 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
9593 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
9594 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
9595 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
9596 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
9598 // At this point there are at most two inputs to the low and high halves from
9599 // each half. That means the inputs can always be grouped into dwords and
9600 // those dwords can then be moved to the correct half with a dword shuffle.
9601 // We use at most one low and one high word shuffle to collect these paired
9602 // inputs into dwords, and finally a dword shuffle to place them.
9603 int PSHUFLMask[4] = {-1, -1, -1, -1};
9604 int PSHUFHMask[4] = {-1, -1, -1, -1};
9605 int PSHUFDMask[4] = {-1, -1, -1, -1};
9607 // First fix the masks for all the inputs that are staying in their
9608 // original halves. This will then dictate the targets of the cross-half shuffles.
9610 auto fixInPlaceInputs =
9611 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
9612 MutableArrayRef<int> SourceHalfMask,
9613 MutableArrayRef<int> HalfMask, int HalfOffset) {
9614 if (InPlaceInputs.empty())
9616 if (InPlaceInputs.size() == 1) {
9617 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9618 InPlaceInputs[0] - HalfOffset;
9619 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
9622 if (IncomingInputs.empty()) {
9623 // Just fix all of the in place inputs.
9624 for (int Input : InPlaceInputs) {
9625 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
9626 PSHUFDMask[Input / 2] = Input / 2;
9631 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
9632 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9633 InPlaceInputs[0] - HalfOffset;
9634 // Put the second input next to the first so that they are packed into
9635 // a dword. We find the adjacent index by toggling the low bit.
9636 int AdjIndex = InPlaceInputs[0] ^ 1;
9637 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
9638 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
9639 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
9641 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
9642 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
9644 // Now gather the cross-half inputs and place them into a free dword of
9645 // their target half.
9646 // FIXME: This operation could almost certainly be simplified dramatically to
9647 // look more like the 3-1 fixing operation.
9648 auto moveInputsToRightHalf = [&PSHUFDMask](
9649 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9650 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9651 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9653 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9654 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
9656 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9658 int LowWord = Word & ~1;
9659 int HighWord = Word | 1;
9660 return isWordClobbered(SourceHalfMask, LowWord) ||
9661 isWordClobbered(SourceHalfMask, HighWord);
9664 if (IncomingInputs.empty())
9667 if (ExistingInputs.empty()) {
9668 // Map any dwords with inputs from them into the right half.
9669 for (int Input : IncomingInputs) {
9670 // If the source half mask maps over the inputs, turn those into
9671 // swaps and use the swapped lane.
9672 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9673 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
9674 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9675 Input - SourceOffset;
9676 // We have to swap the uses in our half mask in one sweep.
9677 for (int &M : HalfMask)
9678 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9680 else if (M == Input)
9681 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9683 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9684 Input - SourceOffset &&
9685 "Previous placement doesn't match!");
9687 // Note that this correctly re-maps both when we do a swap and when
9688 // we observe the other side of the swap above. We rely on that to
9689 // avoid swapping the members of the input list directly.
9690 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9693 // Map the input's dword into the correct half.
9694 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
9695 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9697 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9699 "Previous placement doesn't match!");
9702 // And just directly shift any other-half mask elements to be same-half
9703 // as we will have mirrored the dword containing the element into the
9704 // same position within that half.
9705 for (int &M : HalfMask)
9706 if (M >= SourceOffset && M < SourceOffset + 4) {
9707 M = M - SourceOffset + DestOffset;
9708 assert(M >= 0 && "This should never wrap below zero!");
9713 // Ensure we have the input in a viable dword of its current half. This
9714 // is particularly tricky because the original position may be clobbered
9715 // by inputs being moved and *staying* in that half.
9716 if (IncomingInputs.size() == 1) {
9717 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9718 int InputFixed = std::find(std::begin(SourceHalfMask),
9719 std::end(SourceHalfMask), -1) -
9720 std::begin(SourceHalfMask) + SourceOffset;
9721 SourceHalfMask[InputFixed - SourceOffset] =
9722 IncomingInputs[0] - SourceOffset;
9723 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9725 IncomingInputs[0] = InputFixed;
9727 } else if (IncomingInputs.size() == 2) {
9728 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9729 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9730 // We have two non-adjacent or clobbered inputs we need to extract from
9731 // the source half. To do this, we need to map them into some adjacent
9732 // dword slot in the source mask.
9733 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9734 IncomingInputs[1] - SourceOffset};
9736 // If there is a free slot in the source half mask adjacent to one of
9737 // the inputs, place the other input in it. We use (Index XOR 1) to
9738 // compute an adjacent index.
9739 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9740 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
9741 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9742 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9743 InputsFixed[1] = InputsFixed[0] ^ 1;
9744 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9745 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
9746 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9747 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9748 InputsFixed[0] = InputsFixed[1] ^ 1;
9749 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
9750 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
9751 // The two inputs are in the same DWord but it is clobbered and the
9752 // adjacent DWord isn't used at all. Move both inputs to the free slot.
9754 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9755 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9756 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9757 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9759 // The only way we hit this point is if there is no clobbering
9760 // (because there are no off-half inputs to this half) and there is no
9761 // free slot adjacent to one of the inputs. In this case, we have to
9762 // swap an input with a non-input.
9763 for (int i = 0; i < 4; ++i)
9764 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
9765 "We can't handle any clobbers here!");
9766 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9767 "Cannot have adjacent inputs here!");
9769 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9770 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9772 // We also have to update the final source mask in this case because
9773 // it may need to undo the above swap.
9774 for (int &M : FinalSourceHalfMask)
9775 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9776 M = InputsFixed[1] + SourceOffset;
9777 else if (M == InputsFixed[1] + SourceOffset)
9778 M = (InputsFixed[0] ^ 1) + SourceOffset;
9780 InputsFixed[1] = InputsFixed[0] ^ 1;
9783 // Point everything at the fixed inputs.
9784 for (int &M : HalfMask)
9785 if (M == IncomingInputs[0])
9786 M = InputsFixed[0] + SourceOffset;
9787 else if (M == IncomingInputs[1])
9788 M = InputsFixed[1] + SourceOffset;
9790 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9791 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9794 llvm_unreachable("Unhandled input size!");
9797 // Now hoist the DWord down to the right half.
9798 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
9799 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
9800 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9801 for (int &M : HalfMask)
9802 for (int Input : IncomingInputs)
9804 M = FreeDWord * 2 + Input % 2;
9806 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9807 /*SourceOffset*/ 4, /*DestOffset*/ 0);
9808 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9809 /*SourceOffset*/ 0, /*DestOffset*/ 4);
9811 // Now enact all the shuffles we've computed to move the inputs into their target halves.
9813 if (!isNoopShuffleMask(PSHUFLMask))
9814 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
9815 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
9816 if (!isNoopShuffleMask(PSHUFHMask))
9817 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
9818 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
9819 if (!isNoopShuffleMask(PSHUFDMask))
9822 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
9823 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9825 // At this point, each half should contain all its inputs, and we can then
9826 // just shuffle them into their final position.
9827 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
9828 "Failed to lift all the high half inputs to the low mask!");
9829 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
9830 "Failed to lift all the low half inputs to the high mask!");
9832 // Do a half shuffle for the low mask.
9833 if (!isNoopShuffleMask(LoMask))
9834 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
9835 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
9837 // Do a half shuffle with the high mask after shifting its values down.
9838 for (int &M : HiMask)
9841 if (!isNoopShuffleMask(HiMask))
9842 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
9843 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
9848 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
9849 /// blend if only one input is used.
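///
/// As a rough illustration: an 8-element i16 mask is scaled by two, so each
/// mask entry M expands to the byte selectors 2*M and 2*M+1 (or 0x80 for
/// lanes known to be zero), and an input only receives a PSHUFB if at least
/// one selector actually reads from it.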
9850 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
9851 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9852 SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
9853 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9859 int Size = Mask.size();
9860 int Scale = 16 / Size;
9861 for (int i = 0; i < 16; ++i) {
9862 if (Mask[i / Scale] < 0) {
9863 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9865 const int ZeroMask = 0x80;
9866 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
9868 int V2Idx = Mask[i / Scale] < Size
9870 : (Mask[i / Scale] - Size) * Scale + i % Scale;
9871 if (Zeroable[i / Scale])
9872 V1Idx = V2Idx = ZeroMask;
9873 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
9874 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
9875 V1InUse |= (ZeroMask != V1Idx);
9876 V2InUse |= (ZeroMask != V2Idx);
9881 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
9882 DAG.getBitcast(MVT::v16i8, V1),
9883 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
9885 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
9886 DAG.getBitcast(MVT::v16i8, V2),
9887 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
9889 // If we need shuffled inputs from both, blend the two.
9891 if (V1InUse && V2InUse)
9892 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9894 V = V1InUse ? V1 : V2;
9896 // Cast the result back to the correct type.
9897 return DAG.getBitcast(VT, V);
9900 /// \brief Generic lowering of 8-lane i16 shuffles.
9902 /// This handles both single-input shuffles and combined shuffle/blends with
9903 /// two inputs. The single input shuffles are immediately delegated to
9904 /// a dedicated lowering routine.
9906 /// The blends are lowered in one of three fundamental ways. If there are few
9907 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9908 /// of the input is significantly cheaper when lowered as an interleaving of
9909 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9910 /// halves of the inputs separately (making them have relatively few inputs)
9911 /// and then concatenate them.
9912 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9913 SDValue V1, SDValue V2,
9914 const X86Subtarget &Subtarget,
9915 SelectionDAG &DAG) {
9916 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9917 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9918 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9920 // Whenever we can lower this as a zext, that instruction is strictly faster
9921 // than any alternative.
9922 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9923 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9926 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
9928 if (NumV2Inputs == 0) {
9929 // Check for being able to broadcast a single element.
9930 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9931 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9934 // Try to use shift instructions.
9935 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
9939 // Use dedicated unpack instructions for masks that match their pattern.
9941 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
9944 // Try to use byte rotation instructions.
9945 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
9946 Mask, Subtarget, DAG))
9949 // Make a copy of the mask so it can be modified.
9950 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
9951 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
9952 MutableMask, Subtarget,
9956 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
9957 "All single-input shuffles should be canonicalized to be V1-input "
9960 // Try to use shift instructions.
9961 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
9965 // See if we can use SSE4A Extraction / Insertion.
9966 if (Subtarget.hasSSE4A())
9967 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
9970 // There are special ways we can lower some single-element blends.
9971 if (NumV2Inputs == 1)
9972 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
9973 Mask, Subtarget, DAG))
9976 // We have different paths for blend lowering, but they all must use the
9977 // *exact* same predicate.
9978 bool IsBlendSupported = Subtarget.hasSSE41();
9979 if (IsBlendSupported)
9980 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9984 if (SDValue Masked =
9985 lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
9988 // Use dedicated unpack instructions for masks that match their pattern.
9990 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
9993 // Try to use byte rotation instructions.
9994 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9995 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9998 if (SDValue BitBlend =
9999 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
10002 // Try to lower by permuting the inputs into an unpack instruction.
10003 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
10007 // If we can't directly blend but can use PSHUFB, that will be better as it
10008 // can both shuffle and set up the inefficient blend.
10009 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
10010 bool V1InUse, V2InUse;
10011 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, DAG,
10015 // We can always bit-blend if we have to so the fallback strategy is to
10016 // decompose into single-input permutes and blends.
10017 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
10021 /// \brief Check whether a compaction lowering can be done by dropping even
10022 /// elements and compute how many times even elements must be dropped.
10024 /// This handles shuffles which take every Nth element where N is a power of
10025 /// two. Example shuffle masks:
10027 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10028 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10029 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10030 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10031 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10032 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10034 /// Any of these lanes can of course be undef.
10036 /// This routine only supports N <= 3.
10037 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10040 /// \returns N above, or the number of times even elements must be dropped if
10041 /// there is such a number. Otherwise returns zero.
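///
/// As a concrete (illustrative) instance of the check: for a single-input
/// v16i8 mask the modulus is 16, so with N = 2 element i must equal
/// (i * 4) mod 16, which is exactly the repeating 0, 4, 8, 12 pattern above.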
10042 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
10043 bool IsSingleInput) {
10044 // The modulus for the shuffle vector entries is based on whether this is
10045 // a single input or not.
10046 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10047 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10048 "We should only be called with masks with a power-of-2 size!");
10050 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10052 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10053 // and 2^3 simultaneously. This is because we may have ambiguity with
10054 // partially undef inputs.
10055 bool ViableForN[3] = {true, true, true};
10057 for (int i = 0, e = Mask.size(); i < e; ++i) {
10058 // Ignore undef lanes; we'll optimistically collapse them to the pattern we want.
10063 bool IsAnyViable = false;
10064 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10065 if (ViableForN[j]) {
10066 uint64_t N = j + 1;
10068 // The shuffle mask must be equal to (i * 2^N) % M.
10069 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
10070 IsAnyViable = true;
10072 ViableForN[j] = false;
10074 // Early exit if we exhaust the possible powers of two.
10079 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10083 // Return 0 as there is no viable power of two.
10087 /// \brief Generic lowering of v16i8 shuffles.
10089 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
10090 /// detect any complexity reducing interleaving. If that doesn't help, it uses
10091 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
10092 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them back together.
10094 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10095 SDValue V1, SDValue V2,
10096 const X86Subtarget &Subtarget,
10097 SelectionDAG &DAG) {
10098 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10099 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10100 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10102 // Try to use shift instructions.
10103 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
10107 // Try to use byte rotation instructions.
10108 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10109 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10112 // Try to use a zext lowering.
10113 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10114 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10117 // See if we can use SSE4A Extraction / Insertion.
10118 if (Subtarget.hasSSE4A())
10119 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
10122 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
10124 // For single-input shuffles, there are some nicer lowering tricks we can use.
10125 if (NumV2Elements == 0) {
10126 // Check for being able to broadcast a single element.
10127 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10128 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10131 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
10132 // Notably, this handles splat and partial-splat shuffles more efficiently.
10133 // However, it only makes sense if the pre-duplication shuffle simplifies
10134 // things significantly. Currently, this means we need to be able to
10135 // express the pre-duplication shuffle as an i16 shuffle.
10137 // FIXME: We should check for other patterns which can be widened into an
10138 // i16 shuffle as well.
10139 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
10140 for (int i = 0; i < 16; i += 2)
10141 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
10146 auto tryToWidenViaDuplication = [&]() -> SDValue {
10147 if (!canWidenViaDuplication(Mask))
10149 SmallVector<int, 4> LoInputs;
10150 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
10151 [](int M) { return M >= 0 && M < 8; });
10152 std::sort(LoInputs.begin(), LoInputs.end());
10153 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
10155 SmallVector<int, 4> HiInputs;
10156 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
10157 [](int M) { return M >= 8; });
10158 std::sort(HiInputs.begin(), HiInputs.end());
10159 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
10162 bool TargetLo = LoInputs.size() >= HiInputs.size();
10163 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
10164 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
10166 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
10167 SmallDenseMap<int, int, 8> LaneMap;
10168 for (int I : InPlaceInputs) {
10169 PreDupI16Shuffle[I/2] = I/2;
10172 int j = TargetLo ? 0 : 4, je = j + 4;
10173 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
10174 // Check if j is already a shuffle of this input. This happens when
10175 // there are two adjacent bytes after we move the low one.
10176 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
10177 // If we haven't yet mapped the input, search for a slot into which we can map it.
10179 while (j < je && PreDupI16Shuffle[j] >= 0)
10183 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
10186 // Map this input with the i16 shuffle.
10187 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
10190 // Update the lane map based on the mapping we ended up with.
10191 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
10193 V1 = DAG.getBitcast(
10195 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
10196 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
10198 // Unpack the bytes to form the i16s that will be shuffled into place.
10199 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10200 MVT::v16i8, V1, V1);
10202 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10203 for (int i = 0; i < 16; ++i)
10204 if (Mask[i] >= 0) {
10205 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
10206 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
10207 if (PostDupI16Shuffle[i / 2] < 0)
10208 PostDupI16Shuffle[i / 2] = MappedMask;
10210 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
10211 "Conflicting entrties in the original shuffle!");
10213 return DAG.getBitcast(
10215 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
10216 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
10218 if (SDValue V = tryToWidenViaDuplication())
10222 if (SDValue Masked =
10223 lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG))
10226 // Use dedicated unpack instructions for masks that match their pattern.
10228 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
10231 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
10232 // with PSHUFB. It is important to do this before we attempt to generate any
10233 // blends but after all of the single-input lowerings. If the single input
10234 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
10235 // want to preserve that and we can DAG combine any longer sequences into
10236 // a PSHUFB in the end. But once we start blending from multiple inputs,
10237 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
10238 // and there are *very* few patterns that would actually be faster than the
10239 // PSHUFB approach because of its ability to zero lanes.
10241 // FIXME: The only exceptions to the above are blends which are exact
10242 // interleavings with direct instructions supporting them. We currently don't
10243 // handle those well here.
10244 if (Subtarget.hasSSSE3()) {
10245 bool V1InUse = false;
10246 bool V2InUse = false;
10248 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
10249 DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse);
10251 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
10252 // do so. This avoids using them to handle blends-with-zero which is
10253 // important as a single pshufb is significantly faster for that.
10254 if (V1InUse && V2InUse) {
10255 if (Subtarget.hasSSE41())
10256 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
10257 Mask, Subtarget, DAG))
10260 // We can use an unpack to do the blending rather than an or in some
10261 // cases. Even though the or may be (very slightly) more efficient, we
10262 // prefer this lowering because there are common cases where part of
10263 // the complexity of the shuffles goes away when we do the final blend as an unpack.
10265 // FIXME: It might be worth trying to detect if the unpack-feeding
10266 // shuffles will both be pshufb, in which case we shouldn't bother with this strategy.
10268 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10269 DL, MVT::v16i8, V1, V2, Mask, DAG))
10276 // There are special ways we can lower some single-element blends.
10277 if (NumV2Elements == 1)
10278 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
10279 Mask, Subtarget, DAG))
10282 if (SDValue BitBlend =
10283 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
10286 // Check whether a compaction lowering can be done. This handles shuffles
10287 // which take every Nth element for some even N. See the helper function for details.
10290 // We special case these as they can be particularly efficiently handled with
10291 // the PACKUSWB instruction on x86 and they show up in common patterns of
10292 // rearranging bytes to truncate wide elements.
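  // As a rough illustration: the two-input mask <0, 2, 4, ..., 30> (N = 1) is
  // matched by clearing the high byte of every i16 lane in both inputs and
  // then packing them with a single PACKUSWB.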
10293 bool IsSingleInput = V2.isUndef();
10294 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
10295 // NumEvenDrops is the power of two stride of the elements. Another way of
10296 // thinking about it is that we need to drop the even elements this many
10297 // times to get the original input.
10299 // First we need to zero all the dropped bytes.
10300 assert(NumEvenDrops <= 3 &&
10301 "No support for dropping even elements more than 3 times.");
10302 // We use the mask type to pick which bytes are preserved based on how many
10303 // elements are dropped.
10304 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
10305 SDValue ByteClearMask = DAG.getBitcast(
10306 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
10307 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
10308 if (!IsSingleInput)
10309 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
10311 // Now pack things back together.
10312 V1 = DAG.getBitcast(MVT::v8i16, V1);
10313 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
10314 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
10315 for (int i = 1; i < NumEvenDrops; ++i) {
10316 Result = DAG.getBitcast(MVT::v8i16, Result);
10317 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
10323 // Handle multi-input cases by blending single-input shuffles.
10324 if (NumV2Elements > 0)
10325 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
10328 // The fallback path for single-input shuffles widens this into two v8i16
10329 // vectors with unpacks, shuffles those, and then pulls them back together with a pack.
10333 int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10334 int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10335 for (int i = 0; i < 16; ++i)
10337 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
10339 SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
10341 SDValue VLoHalf, VHiHalf;
10342 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
10343 // them out and avoid using UNPCK{L,H} to extract the elements of V as vectors of i16s.
10345 if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
10346 [](int M) { return M >= 0 && M % 2 == 1; }) &&
10347 std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
10348 [](int M) { return M >= 0 && M % 2 == 1; })) {
10349 // Use a mask to drop the high bytes.
10350 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
10351 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
10352 DAG.getConstant(0x00FF, DL, MVT::v8i16));
10354 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
10355 VHiHalf = DAG.getUNDEF(MVT::v8i16);
10357 // Squash the masks to point directly into VLoHalf.
10358 for (int &M : LoBlendMask)
10361 for (int &M : HiBlendMask)
10365 // Otherwise just unpack the low half of V into VLoHalf and the high half into
10366 // VHiHalf so that we can blend them as i16s.
10367 VLoHalf = DAG.getBitcast(
10368 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
10369 VHiHalf = DAG.getBitcast(
10370 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
10373 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
10374 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
10376 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
10379 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
10381 /// This routine breaks down the specific type of 128-bit shuffle and
10382 /// dispatches to the lowering routines accordingly.
10383 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10384 MVT VT, SDValue V1, SDValue V2,
10385 const X86Subtarget &Subtarget,
10386 SelectionDAG &DAG) {
10387 switch (VT.SimpleTy) {
10389 return lowerV2I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10391 return lowerV2F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10393 return lowerV4I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10395 return lowerV4F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10397 return lowerV8I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10399 return lowerV16I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10402 llvm_unreachable("Unimplemented!");
10406 /// \brief Helper function to test whether a shuffle mask could be
10407 /// simplified by widening the elements being shuffled.
10409 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
10410 /// leaves it in an unspecified state.
10412 /// NOTE: This must handle normal vector shuffle masks and *target* vector
10413 /// shuffle masks. The latter have the special property of a '-2' representing
10414 /// a zero-ed lane of a vector.
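///
/// For example (illustrative): the v4 mask <2, 3, 0, 1> widens to the v2 mask
/// <1, 0>, while <1, 2, 3, 0> cannot be widened because neither pair starts on
/// an even element.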
10415 static bool canWidenShuffleElements(ArrayRef<int> Mask,
10416 SmallVectorImpl<int> &WidenedMask) {
10417 WidenedMask.assign(Mask.size() / 2, 0);
10418 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
10419 // If both elements are undef, it's trivial.
10420 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
10421 WidenedMask[i/2] = SM_SentinelUndef;
10425 // Check for an undef mask and a mask value properly aligned to fit with
10426 // a pair of values. If we find such a case, use the non-undef mask's value.
10427 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
10428 WidenedMask[i/2] = Mask[i + 1] / 2;
10431 if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
10432 WidenedMask[i/2] = Mask[i] / 2;
10436 // When zeroing, we need to spread the zeroing across both lanes to widen.
10437 if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
10438 if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
10439 (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
10440 WidenedMask[i/2] = SM_SentinelZero;
10446 // Finally check if the two mask values are adjacent and properly aligned to form a pair.
10448 if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
10449 WidenedMask[i/2] = Mask[i] / 2;
10453 // Otherwise we can't safely widen the elements used in this shuffle.
10456 assert(WidenedMask.size() == Mask.size() / 2 &&
10457 "Incorrect size of mask after widening the elements!");
10462 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
10464 /// This routine just extracts two subvectors, shuffles them independently, and
10465 /// then concatenates them back together. This should work effectively with all
10466 /// AVX vector shuffle types.
10467 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
10468 SDValue V2, ArrayRef<int> Mask,
10469 SelectionDAG &DAG) {
10470 assert(VT.getSizeInBits() >= 256 &&
10471 "Only for 256-bit or wider vector shuffles!");
10472 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
10473 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
10475 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
10476 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
10478 int NumElements = VT.getVectorNumElements();
10479 int SplitNumElements = NumElements / 2;
10480 MVT ScalarVT = VT.getVectorElementType();
10481 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
10483 // Rather than splitting build-vectors, just build two narrower build
10484 // vectors. This helps shuffling with splats and zeros.
10485 auto SplitVector = [&](SDValue V) {
10486 V = peekThroughBitcasts(V);
10488 MVT OrigVT = V.getSimpleValueType();
10489 int OrigNumElements = OrigVT.getVectorNumElements();
10490 int OrigSplitNumElements = OrigNumElements / 2;
10491 MVT OrigScalarVT = OrigVT.getVectorElementType();
10492 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
10496 auto *BV = dyn_cast<BuildVectorSDNode>(V);
10498 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
10499 DAG.getIntPtrConstant(0, DL));
10500 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
10501 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
10504 SmallVector<SDValue, 16> LoOps, HiOps;
10505 for (int i = 0; i < OrigSplitNumElements; ++i) {
10506 LoOps.push_back(BV->getOperand(i));
10507 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
10509 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
10510 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
10512 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
10513 DAG.getBitcast(SplitVT, HiV));
10516 SDValue LoV1, HiV1, LoV2, HiV2;
10517 std::tie(LoV1, HiV1) = SplitVector(V1);
10518 std::tie(LoV2, HiV2) = SplitVector(V2);
10520 // Now create two 4-way blends of these half-width vectors.
10521 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
10522 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
10523 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
10524 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
10525 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
10526 for (int i = 0; i < SplitNumElements; ++i) {
10527 int M = HalfMask[i];
10528 if (M >= NumElements) {
10529 if (M >= NumElements + SplitNumElements)
10533 V2BlendMask[i] = M - NumElements;
10534 BlendMask[i] = SplitNumElements + i;
10535 } else if (M >= 0) {
10536 if (M >= SplitNumElements)
10540 V1BlendMask[i] = M;
10545 // Because the lowering happens after all combining takes place, we need to
10546 // manually combine these blend masks as much as possible so that we create
10547 // a minimal number of high-level vector shuffle nodes.
10549 // First try just blending the halves of V1 or V2.
10550 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
10551 return DAG.getUNDEF(SplitVT);
10552 if (!UseLoV2 && !UseHiV2)
10553 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10554 if (!UseLoV1 && !UseHiV1)
10555 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10557 SDValue V1Blend, V2Blend;
10558 if (UseLoV1 && UseHiV1) {
10560 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10562 // We only use half of V1 so map the usage down into the final blend mask.
10563 V1Blend = UseLoV1 ? LoV1 : HiV1;
10564 for (int i = 0; i < SplitNumElements; ++i)
10565 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
10566 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
10568 if (UseLoV2 && UseHiV2) {
10570 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10572 // We only use half of V2 so map the usage down into the final blend mask.
10573 V2Blend = UseLoV2 ? LoV2 : HiV2;
10574 for (int i = 0; i < SplitNumElements; ++i)
10575 if (BlendMask[i] >= SplitNumElements)
10576 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10578 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10580 SDValue Lo = HalfBlend(LoMask);
10581 SDValue Hi = HalfBlend(HiMask);
10582 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10585 /// \brief Either split a vector in halves or decompose the shuffles and the
10588 /// This is provided as a good fallback for many lowerings of non-single-input
10589 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10590 /// between splitting the shuffle into 128-bit components and stitching those
10591 /// back together vs. extracting the single-input shuffles and blending those results.
10593 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
10594 SDValue V1, SDValue V2,
10595 ArrayRef<int> Mask,
10596 SelectionDAG &DAG) {
10597 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
10598 "shuffles as it could then recurse on itself.");
10599 int Size = Mask.size();
10601 // If this can be modeled as a broadcast of two elements followed by a blend,
10602 // prefer that lowering. This is especially important because broadcasts can
10603 // often fold with memory operands.
10604 auto DoBothBroadcast = [&] {
10605 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10608 if (V2BroadcastIdx < 0)
10609 V2BroadcastIdx = M - Size;
10610 else if (M - Size != V2BroadcastIdx)
10612 } else if (M >= 0) {
10613 if (V1BroadcastIdx < 0)
10614 V1BroadcastIdx = M;
10615 else if (M != V1BroadcastIdx)
10620 if (DoBothBroadcast())
10621 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10624 // If the inputs all stem from a single 128-bit lane of each input, then we
10625 // split them rather than blending because the split will decompose to
10626 // unusually few instructions.
10627 int LaneCount = VT.getSizeInBits() / 128;
10628 int LaneSize = Size / LaneCount;
10629 SmallBitVector LaneInputs[2];
10630 LaneInputs[0].resize(LaneCount, false);
10631 LaneInputs[1].resize(LaneCount, false);
10632 for (int i = 0; i < Size; ++i)
10634 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10635 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10636 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10638 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10639 // that the decomposed single-input shuffles don't end up here.
10640 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10643 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10644 /// a permutation and blend of those lanes.
10646 /// This essentially blends the out-of-lane inputs to each lane into the lane
10647 /// from a permuted copy of the vector. This lowering strategy results in four
10648 /// instructions in the worst case for a single-input cross lane shuffle which
10649 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10650 /// of. Special cases for each particular shuffle pattern should be handled
10651 /// prior to trying this lowering.
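///
/// As a rough illustration: the single-input v4f64 mask <2, 1, 0, 3> is
/// handled by swapping the two 128-bit halves with one VPERM2X128 and then
/// performing the now in-lane shuffle <4, 1, 6, 3> of the original and the
/// flipped vector, which lowers to a simple blend.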
10652 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
10653 SDValue V1, SDValue V2,
10654 ArrayRef<int> Mask,
10655 SelectionDAG &DAG) {
10656 // FIXME: This should probably be generalized for 512-bit vectors as well.
10657 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
10658 int Size = Mask.size();
10659 int LaneSize = Size / 2;
10661 // If there are only inputs from one 128-bit lane, splitting will in fact be
10662 // less expensive. The flags track whether the given lane contains an element
10663 // that crosses to another lane.
10664 bool LaneCrossing[2] = {false, false};
10665 for (int i = 0; i < Size; ++i)
10666 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10667 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10668 if (!LaneCrossing[0] || !LaneCrossing[1])
10669 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10671 assert(V2.isUndef() &&
10672 "This last part of this routine only works on single input shuffles");
10674 SmallVector<int, 32> FlippedBlendMask(Size);
10675 for (int i = 0; i < Size; ++i)
10676 FlippedBlendMask[i] =
10677 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10679 : Mask[i] % LaneSize +
10680 (i / LaneSize) * LaneSize + Size);
10682 // Flip the vector, and blend the results which should now be in-lane. The
10683 // VPERM2X128 mask uses bits [1:0] to select the source for the low half of
10684 // the destination and bits [5:4] for the high half. The value 3 selects the
10685 // high half of source 2 and the value 2 selects the low half of source 2. We only use source 2 to
10686 // allow folding it into a memory operand.
10687 unsigned PERMMask = 3 | 2 << 4;
10688 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10689 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
10690 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10693 /// \brief Handle lowering 2-lane 128-bit shuffles.
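///
/// As a rough illustration: the v4i64 element mask <2, 3, 6, 7> takes the high
/// 128 bits of both sources and is encoded as the VPERM2X128 immediate 0x31.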
10694 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
10695 SDValue V2, ArrayRef<int> Mask,
10696 const X86Subtarget &Subtarget,
10697 SelectionDAG &DAG) {
10698 // TODO: If minimizing size and one of the inputs is a zero vector and the
10699 // zero vector has only one use, we could use a VPERM2X128 to save the
10700 // instruction bytes needed to explicitly generate the zero vector.
10702 // Blends are faster and handle all the non-lane-crossing cases.
10703 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10707 bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
10708 bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
10710 // If either input operand is a zero vector, use VPERM2X128 because its mask
10711 // allows us to replace the zero input with an implicit zero.
10712 if (!IsV1Zero && !IsV2Zero) {
10713 // Check for patterns which can be matched with a single insert of a 128-bit subvector.
10715 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
10716 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
10717 // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
10718 if (Subtarget.hasAVX2() && V2.isUndef())
10721 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10722 VT.getVectorNumElements() / 2);
10723 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10724 DAG.getIntPtrConstant(0, DL));
10725 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10726 OnlyUsesV1 ? V1 : V2,
10727 DAG.getIntPtrConstant(0, DL));
10728 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10732 // Otherwise form a 128-bit permutation. After accounting for undefs,
10733 // convert the 64-bit shuffle mask selection values into 128-bit
10734 // selection bits by dividing the indexes by 2 and shifting into positions
10735 // defined by a vperm2*128 instruction's immediate control byte.
  // The immediate permute control byte looks like this:
  //    [1:0] - select 128 bits from sources for low half of destination
  //    [2]   - ignore
  //    [3]   - zero low half of destination
  //    [5:4] - select 128 bits from sources for high half of destination
  //    [6]   - ignore
  //    [7]   - zero high half of destination
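  // As an illustration, a v4f64 mask <2, 3, 6, 7> gives MaskLO = 2 and
  // MaskHI = 6, so PermMask = (2 / 2) | (6 / 2) << 4 = 0x31: the low half of
  // the destination comes from the high half of V1 and the high half comes
  // from the high half of V2.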
10745 int MaskLO = Mask[0];
10746 if (MaskLO == SM_SentinelUndef)
10747 MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
10749 int MaskHI = Mask[2];
10750 if (MaskHI == SM_SentinelUndef)
10751 MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
10753 unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
  // If either input is a zero vector, replace it with an undef input.
  // Shuffle mask values <  4 are selecting elements of V1.
  // Shuffle mask values >= 4 are selecting elements of V2.
  // Adjust each half of the permute mask by clearing the half that was
  // selecting the zero vector and setting the zero mask bit.
  if (IsV1Zero) {
    V1 = DAG.getUNDEF(VT);
    if (MaskLO < 4)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI < 4)
      PermMask = (PermMask & 0x0f) | 0x80;
  }
  if (IsV2Zero) {
    V2 = DAG.getUNDEF(VT);
    if (MaskLO >= 4)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI >= 4)
      PermMask = (PermMask & 0x0f) | 0x80;
  }
10775 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10776 DAG.getConstant(PermMask, DL, MVT::i8));
10779 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10780 /// shuffling each lane.
10782 /// This will only succeed when the result of fixing the 128-bit lanes results
10783 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
10785 /// the lane crosses early and then use simpler shuffles within each lane.
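///
/// For example, a two-input v8f32 shuffle with mask <2, 3, 0, 1, 14, 15, 12, 13>
/// first selects lane 0 of V1 and lane 1 of V2 with a v4f64 lane shuffle and
/// then applies the repeated in-lane mask <2, 3, 0, 1> to that result.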
10787 /// FIXME: It might be worthwhile at some point to support this without
10788 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10789 /// in x86 only floating point has interesting non-repeating shuffles, and even
10790 /// those are still *marginally* more expensive.
10791 static SDValue lowerVectorShuffleByMerging128BitLanes(
10792 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10793 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10794 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
10796 int Size = Mask.size();
10797 int LaneSize = 128 / VT.getScalarSizeInBits();
10798 int NumLanes = Size / LaneSize;
10799 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10801 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10802 // check whether the in-128-bit lane shuffles share a repeating pattern.
10803 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
10804 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
10805 for (int i = 0; i < Size; ++i) {
10809 int j = i / LaneSize;
10811 if (Lanes[j] < 0) {
10812 // First entry we've seen for this lane.
10813 Lanes[j] = Mask[i] / LaneSize;
10814 } else if (Lanes[j] != Mask[i] / LaneSize) {
10815 // This doesn't match the lane selected previously!
10819 // Check that within each lane we have a consistent shuffle mask.
10820 int k = i % LaneSize;
10821 if (InLaneMask[k] < 0) {
10822 InLaneMask[k] = Mask[i] % LaneSize;
10823 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10824 // This doesn't fit a repeating in-lane mask.
10829 // First shuffle the lanes into place.
10830 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10831 VT.getSizeInBits() / 64);
10832 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
10833 for (int i = 0; i < NumLanes; ++i)
10834 if (Lanes[i] >= 0) {
10835 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10836 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10839 V1 = DAG.getBitcast(LaneVT, V1);
10840 V2 = DAG.getBitcast(LaneVT, V2);
10841 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10843 // Cast it back to the type we actually want.
10844 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
10846 // Now do a simple shuffle that isn't lane crossing.
10847 SmallVector<int, 8> NewMask((unsigned)Size, -1);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10851 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10852 "Must not introduce lane crosses at this point!");
10854 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10857 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
10858 /// This allows for fast cases such as subvector extraction/insertion
10859 /// or shuffling smaller vector types which can lower more efficiently.
10860 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
10861 SDValue V1, SDValue V2,
10862 ArrayRef<int> Mask,
10863 const X86Subtarget &Subtarget,
10864 SelectionDAG &DAG) {
10865 assert(VT.is256BitVector() && "Expected 256-bit vector");
10867 unsigned NumElts = VT.getVectorNumElements();
10868 unsigned HalfNumElts = NumElts / 2;
10869 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
10871 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
10872 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
  if (!UndefLower && !UndefUpper)
    return SDValue();
10876 // Upper half is undef and lower half is whole upper subvector.
10877 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  if (UndefUpper &&
      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
10880 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
10881 DAG.getIntPtrConstant(HalfNumElts, DL));
10882 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
10883 DAG.getIntPtrConstant(0, DL));
10886 // Lower half is undef and upper half is whole lower subvector.
10887 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (UndefLower &&
      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
10890 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
10891 DAG.getIntPtrConstant(0, DL));
10892 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
10893 DAG.getIntPtrConstant(HalfNumElts, DL));
10896 // If the shuffle only uses two of the four halves of the input operands,
10897 // then extract them and perform the 'half' shuffle at half width.
10898 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
10899 int HalfIdx1 = -1, HalfIdx2 = -1;
10900 SmallVector<int, 8> HalfMask(HalfNumElts);
10901 unsigned Offset = UndefLower ? HalfNumElts : 0;
10902 for (unsigned i = 0; i != HalfNumElts; ++i) {
10903 int M = Mask[i + Offset];
10909 // Determine which of the 4 half vectors this element is from.
10910 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
10911 int HalfIdx = M / HalfNumElts;
10913 // Determine the element index into its half vector source.
10914 int HalfElt = M % HalfNumElts;
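    // For example, in a v8f32 shuffle (HalfNumElts == 4), M == 9 gives
    // HalfIdx == 2 (lower half of V2) and HalfElt == 1.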
10916 // We can shuffle with up to 2 half vectors, set the new 'half'
10917 // shuffle mask accordingly.
10918 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
10919 HalfMask[i] = HalfElt;
10920 HalfIdx1 = HalfIdx;
10923 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
10924 HalfMask[i] = HalfElt + HalfNumElts;
10925 HalfIdx2 = HalfIdx;
10929 // Too many half vectors referenced.
10932 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
10934 // Only shuffle the halves of the inputs when useful.
10935 int NumLowerHalves =
10936 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
10937 int NumUpperHalves =
10938 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
10940 // uuuuXXXX - don't extract uppers just to insert again.
10941 if (UndefLower && NumUpperHalves != 0)
10944 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
10945 if (UndefUpper && NumUpperHalves == 2)
10948 // AVX2 - XXXXuuuu - always extract lowers.
10949 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
10950 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
10951 if (VT == MVT::v4f64 || VT == MVT::v4i64)
10953 // AVX2 supports variable 32-bit element cross-lane shuffles.
10954 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
10955 // XXXXuuuu - don't extract lowers and uppers.
10956 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
10961 auto GetHalfVector = [&](int HalfIdx) {
10963 return DAG.getUNDEF(HalfVT);
10964 SDValue V = (HalfIdx < 2 ? V1 : V2);
10965 HalfIdx = (HalfIdx % 2) * HalfNumElts;
10966 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
10967 DAG.getIntPtrConstant(HalfIdx, DL));
10970 SDValue Half1 = GetHalfVector(HalfIdx1);
10971 SDValue Half2 = GetHalfVector(HalfIdx2);
10972 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
10973 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
10974 DAG.getIntPtrConstant(Offset, DL));
/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
10980 /// This returns true if the elements from a particular input are already in the
10981 /// slot required by the given mask and require no permutation.
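///
/// For example, Mask <0, 5, 2, 7> leaves both inputs in place: input 0 already
/// supplies elements 0 and 2, and input 1 already supplies elements 1 and 3,
/// each at the index its mask value names.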
10982 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10983 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10984 int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;

  return true;
}
10992 /// Handle case where shuffle sources are coming from the same 128-bit lane and
10993 /// every lane can be represented as the same repeating mask - allowing us to
10994 /// shuffle the sources with the repeating shuffle and then permute the result
10995 /// to the destination lanes.
10996 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
10997 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10998 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10999 int NumElts = VT.getVectorNumElements();
11000 int NumLanes = VT.getSizeInBits() / 128;
11001 int NumLaneElts = NumElts / NumLanes;
11003 // On AVX2 we may be able to just shuffle the lowest elements and then
11004 // broadcast the result.
11005 if (Subtarget.hasAVX2()) {
11006 for (unsigned BroadcastSize : {16, 32, 64}) {
11007 if (BroadcastSize <= VT.getScalarSizeInBits())
11009 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs but only referencing the lowest 128-bit
      // lane of the inputs.
11014 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
11015 for (int i = 0; i != NumElts; i += NumBroadcastElts)
11016 for (int j = 0; j != NumBroadcastElts; ++j) {
11017 int M = Mask[i + j];
11020 int &R = RepeatMask[j];
11021 if (0 != ((M % NumElts) / NumLaneElts))
11023 if (0 <= R && R != M)
11030 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
11031 if (!FindRepeatingBroadcastMask(RepeatMask))
11034 // Shuffle the (lowest) repeated elements in place for broadcast.
11035 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
11037 // Shuffle the actual broadcast.
11038 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
11039 for (int i = 0; i != NumElts; i += NumBroadcastElts)
11040 for (int j = 0; j != NumBroadcastElts; ++j)
11041 BroadcastMask[i + j] = j;
11042 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
11047 // Bail if the shuffle mask doesn't cross 128-bit lanes.
11048 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
11051 // Bail if we already have a repeated lane shuffle mask.
11052 SmallVector<int, 8> RepeatedShuffleMask;
11053 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
11056 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
11057 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
11058 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
11059 int NumSubLanes = NumLanes * SubLaneScale;
11060 int NumSubLaneElts = NumLaneElts / SubLaneScale;
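  // For example, for v8f32 on AVX2 this gives SubLaneScale == 2, so the two
  // 128-bit lanes are treated as four 64-bit sub-lanes of two elements each.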
11062 // Check that all the sources are coming from the same lane and see if we can
11063 // form a repeating shuffle mask (local to each sub-lane). At the same time,
11064 // determine the source sub-lane for each destination sub-lane.
11065 int TopSrcSubLane = -1;
11066 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
11067 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
11068 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
11069 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
11071 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
11072 // Extract the sub-lane mask, check that it all comes from the same lane
11073 // and normalize the mask entries to come from the first lane.
11075 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
11076 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
11077 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
11080 int Lane = (M % NumElts) / NumLaneElts;
11081 if ((0 <= SrcLane) && (SrcLane != Lane))
11084 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
11085 SubLaneMask[Elt] = LocalM;
11088 // Whole sub-lane is UNDEF.
11092 // Attempt to match against the candidate repeated sub-lane masks.
11093 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
11094 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
11095 for (int i = 0; i != NumSubLaneElts; ++i) {
11096 if (M1[i] < 0 || M2[i] < 0)
11098 if (M1[i] != M2[i])
11104 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
11105 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
11108 // Merge the sub-lane mask into the matching repeated sub-lane mask.
11109 for (int i = 0; i != NumSubLaneElts; ++i) {
11110 int M = SubLaneMask[i];
11113 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
11114 "Unexpected mask element");
11115 RepeatedSubLaneMask[i] = M;
11118 // Track the top most source sub-lane - by setting the remaining to UNDEF
11119 // we can greatly simplify shuffle matching.
11120 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
11121 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
11122 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
11126 // Bail if we failed to find a matching repeated sub-lane mask.
11127 if (Dst2SrcSubLanes[DstSubLane] < 0)
11130 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
11131 "Unexpected source lane");
11133 // Create a repeating shuffle mask for the entire vector.
11134 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
11135 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
11136 int Lane = SubLane / SubLaneScale;
11137 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
11138 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
11139 int M = RepeatedSubLaneMask[Elt];
11142 int Idx = (SubLane * NumSubLaneElts) + Elt;
11143 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
11146 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
11148 // Shuffle each source sub-lane to its destination.
11149 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
11150 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
11151 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
11152 if (SrcSubLane < 0)
11154 for (int j = 0; j != NumSubLaneElts; ++j)
11155 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
11158 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
11162 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
11163 ArrayRef<int> Mask, SDValue V1,
11164 SDValue V2, SelectionDAG &DAG) {
11166 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
  // Mask for V4F64:  0/1,  4/5,  2/3,  6/7
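  // For example, the v4f64 mask <1, 5, 2, 7> fits the first pattern and yields
  // an immediate of 0b1011 (taking the odd element of each pair except the
  // third).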
11168 assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
11169 int NumElts = VT.getVectorNumElements();
11170 bool ShufpdMask = true;
11171 bool CommutableMask = true;
11172 unsigned Immediate = 0;
11173 for (int i = 0; i < NumElts; ++i) {
11176 int Val = (i & 6) + NumElts * (i & 1);
11177 int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1);
11178 if (Mask[i] < Val || Mask[i] > Val + 1)
11179 ShufpdMask = false;
11180 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
11181 CommutableMask = false;
11182 Immediate |= (Mask[i] % 2) << i;
11185 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11186 DAG.getConstant(Immediate, DL, MVT::i8));
11187 if (CommutableMask)
11188 return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11189 DAG.getConstant(Immediate, DL, MVT::i8));
11193 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
11195 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
11196 /// isn't available.
11197 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11198 SDValue V1, SDValue V2,
11199 const X86Subtarget &Subtarget,
11200 SelectionDAG &DAG) {
11201 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
11202 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
11203 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11205 SmallVector<int, 4> WidenedMask;
11206 if (canWidenShuffleElements(Mask, WidenedMask))
11207 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
11211 if (V2.isUndef()) {
11212 // Check for being able to broadcast a single element.
11213 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11214 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11217 // Use low duplicate instructions for masks that match their pattern.
11218 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11219 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
11221 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
11222 // Non-half-crossing single input shuffles can be lowered with an
11223 // interleaved permutation.
11224 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
11225 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
11226 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
11227 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
11230 // With AVX2 we have direct support for this permutation.
11231 if (Subtarget.hasAVX2())
11232 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
11233 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    // Try to create an in-lane repeating shuffle mask and then shuffle the
    // results into the target lanes.
11237 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11238 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11241 // Otherwise, fall back.
11242 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
11246 // Use dedicated unpack instructions for masks that match their pattern.
11248 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
11251 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
11255 // Check if the blend happens to exactly fit that of SHUFPD.
11257 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
11262 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11263 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11266 // Try to simplify this by merging 128-bit lanes to enable a lane-based
11267 // shuffle. However, if we have AVX2 and either inputs are already in place,
11268 // we will be able to shuffle even across lanes the other input in a single
11269 // instruction so skip this pattern.
11270 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
11271 isShuffleMaskInputInPlace(1, Mask))))
11272 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11273 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
  // If we have AVX2 then we always want to lower with a blend because at v4 we
11277 // can fully permute the elements.
11278 if (Subtarget.hasAVX2())
11279 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
11282 // Otherwise fall back on generic lowering.
11283 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
11286 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
11288 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
11290 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11291 SDValue V1, SDValue V2,
11292 const X86Subtarget &Subtarget,
11293 SelectionDAG &DAG) {
11294 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
11295 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
11296 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11297 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
11299 SmallVector<int, 4> WidenedMask;
11300 if (canWidenShuffleElements(Mask, WidenedMask))
11301 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
11305 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
11309 // Check for being able to broadcast a single element.
11310 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
11311 Mask, Subtarget, DAG))
11314 if (V2.isUndef()) {
11315 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
11316 // can use lower latency instructions that will operate on both lanes.
11317 SmallVector<int, 2> RepeatedMask;
11318 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
11319 SmallVector<int, 4> PSHUFDMask;
11320 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
11321 return DAG.getBitcast(
11323 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
11324 DAG.getBitcast(MVT::v8i32, V1),
11325 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    // AVX2 provides a direct instruction for permuting a single input across
    // lanes.
11330 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
11331 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11334 // Try to use shift instructions.
11335 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
11339 // Use dedicated unpack instructions for masks that match their pattern.
11341 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
11344 // Try to simplify this by merging 128-bit lanes to enable a lane-based
11345 // shuffle. However, if we have AVX2 and either inputs are already in place,
11346 // we will be able to shuffle even across lanes the other input in a single
11347 // instruction so skip this pattern.
11348 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
11349 isShuffleMaskInputInPlace(1, Mask))))
11350 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11351 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
11354 // Otherwise fall back on generic blend lowering.
11355 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
11359 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
11361 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
11362 /// isn't available.
11363 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11364 SDValue V1, SDValue V2,
11365 const X86Subtarget &Subtarget,
11366 SelectionDAG &DAG) {
11367 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
11368 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
11369 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11371 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
11375 // Check for being able to broadcast a single element.
11376 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
11377 Mask, Subtarget, DAG))
11380 // If the shuffle mask is repeated in each 128-bit lane, we have many more
11381 // options to efficiently lower the shuffle.
11382 SmallVector<int, 4> RepeatedMask;
11383 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
11384 assert(RepeatedMask.size() == 4 &&
11385 "Repeated masks must be half the mask width!");
11387 // Use even/odd duplicate instructions for masks that match their pattern.
11388 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
11389 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
11390 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
11391 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
11394 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
11395 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11397 // Use dedicated unpack instructions for masks that match their pattern.
11399 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
11402 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
11403 // have already handled any direct blends.
11404 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
11409 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11410 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
11413 // If we have a single input shuffle with different shuffle patterns in the
11414 // two 128-bit lanes use the variable mask to VPERMILPS.
11415 if (V2.isUndef()) {
11416 SDValue VPermMask[8];
11417 for (int i = 0; i < 8; ++i)
11418 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
11419 : DAG.getConstant(Mask[i], DL, MVT::i32);
11420 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
11421 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
11422 DAG.getBuildVector(MVT::v8i32, DL, VPermMask));
11424 if (Subtarget.hasAVX2())
11425 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
11426 DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
11428 // Otherwise, fall back.
11429 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
11435 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11436 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
11439 // If we have AVX2 then we always want to lower with a blend because at v8 we
11440 // can fully permute the elements.
11441 if (Subtarget.hasAVX2())
11442 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
11445 // Otherwise fall back on generic lowering.
11446 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
11449 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
11451 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
11453 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11454 SDValue V1, SDValue V2,
11455 const X86Subtarget &Subtarget,
11456 SelectionDAG &DAG) {
11457 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
11458 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
11459 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11460 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
11462 // Whenever we can lower this as a zext, that instruction is strictly faster
11463 // than any alternative. It also allows us to fold memory operands into the
11464 // shuffle in many cases.
11465 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
11466 Mask, Subtarget, DAG))
11469 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
11473 // Check for being able to broadcast a single element.
11474 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
11475 Mask, Subtarget, DAG))
11478 // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the two 128-bit
  // lanes.
11481 SmallVector<int, 4> RepeatedMask;
11482 if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
11483 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11485 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
11486 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11488 // Use dedicated unpack instructions for masks that match their pattern.
11490 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
11494 // Try to use shift instructions.
11495 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
11499 // Try to use byte rotation instructions.
11500 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11501 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
11506 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11507 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11510 // If the shuffle patterns aren't repeated but it is a single input, directly
11511 // generate a cross-lane VPERMD instruction.
11512 if (V2.isUndef()) {
11513 SDValue VPermMask[8];
11514 for (int i = 0; i < 8; ++i)
11515 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
11516 : DAG.getConstant(Mask[i], DL, MVT::i32);
11517 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32,
11518 DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
11523 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11524 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11527 // Otherwise fall back on generic blend lowering.
11528 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
11532 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
11534 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
11536 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11537 SDValue V1, SDValue V2,
11538 const X86Subtarget &Subtarget,
11539 SelectionDAG &DAG) {
11540 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
11541 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
11542 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11543 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
11545 // Whenever we can lower this as a zext, that instruction is strictly faster
11546 // than any alternative. It also allows us to fold memory operands into the
11547 // shuffle in many cases.
11548 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
11549 Mask, Subtarget, DAG))
11552 // Check for being able to broadcast a single element.
11553 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
11554 Mask, Subtarget, DAG))
11557 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
11561 // Use dedicated unpack instructions for masks that match their pattern.
11563 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
11566 // Try to use shift instructions.
11567 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
11571 // Try to use byte rotation instructions.
11572 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11573 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
11578 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11579 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11582 if (V2.isUndef()) {
    // There are no generalized cross-lane shuffle operations available on i16
    // element types.
11585 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
11586 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
11589 SmallVector<int, 8> RepeatedMask;
11590 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11591 // As this is a single-input shuffle, the repeated mask should be
11592 // a strictly valid v8i16 mask that we can pass through to the v8i16
11593 // lowering to handle even the v16 case.
11594 return lowerV8I16GeneralSingleInputVectorShuffle(
11595 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
11599 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1,
11600 V2, Subtarget, DAG))
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
11605 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11606 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11609 // Otherwise fall back on generic lowering.
11610 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
11613 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
11615 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
11617 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11618 SDValue V1, SDValue V2,
11619 const X86Subtarget &Subtarget,
11620 SelectionDAG &DAG) {
11621 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
11622 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
11623 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11624 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
11626 // Whenever we can lower this as a zext, that instruction is strictly faster
11627 // than any alternative. It also allows us to fold memory operands into the
11628 // shuffle in many cases.
11629 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
11630 Mask, Subtarget, DAG))
11633 // Check for being able to broadcast a single element.
11634 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
11635 Mask, Subtarget, DAG))
11638 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
11642 // Use dedicated unpack instructions for masks that match their pattern.
11644 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
11647 // Try to use shift instructions.
11648 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
11652 // Try to use byte rotation instructions.
11653 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11654 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
11659 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11660 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
  // There are no generalized cross-lane shuffle operations available on i8
  // element types.
11665 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
11666 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
11669 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1,
11670 V2, Subtarget, DAG))
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
11675 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11676 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11679 // Otherwise fall back on generic lowering.
11680 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
11683 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
11685 /// This routine either breaks down the specific type of a 256-bit x86 vector
11686 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
11687 /// together based on the available instructions.
11688 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11689 MVT VT, SDValue V1, SDValue V2,
11690 const X86Subtarget &Subtarget,
11691 SelectionDAG &DAG) {
11692 // If we have a single input to the zero element, insert that into V1 if we
11693 // can do so cheaply.
11694 int NumElts = VT.getVectorNumElements();
11695 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
11697 if (NumV2Elements == 1 && Mask[0] >= NumElts)
11698 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11699 DL, VT, V1, V2, Mask, Subtarget, DAG))
11702 // Handle special cases where the lower or upper half is UNDEF.
11704 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
11707 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
11708 // can check for those subtargets here and avoid much of the subtarget
11709 // querying in the per-vector-type lowering routines. With AVX1 we have
11710 // essentially *zero* ability to manipulate a 256-bit vector with integer
11711 // types. Since we'll use floating point types there eventually, just
11712 // immediately cast everything to a float and operate entirely in that domain.
11713 if (VT.isInteger() && !Subtarget.hasAVX2()) {
11714 int ElementBits = VT.getScalarSizeInBits();
11715 if (ElementBits < 32) {
11716 // No floating point type available, if we can't use the bit operations
11717 // for masking/blending then decompose into 128-bit vectors.
11718 if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
11720 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11722 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11725 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
11726 VT.getVectorNumElements());
11727 V1 = DAG.getBitcast(FpVT, V1);
11728 V2 = DAG.getBitcast(FpVT, V2);
11729 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
11732 switch (VT.SimpleTy) {
11734 return lowerV4F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11736 return lowerV4I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11738 return lowerV8F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11740 return lowerV8I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11742 return lowerV16I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11744 return lowerV32I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11747 llvm_unreachable("Not a valid 256-bit x86 vector type!");
11751 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
11752 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
11753 ArrayRef<int> Mask, SDValue V1,
11754 SDValue V2, SelectionDAG &DAG) {
11755 assert(VT.getScalarSizeInBits() == 64 &&
11756 "Unexpected element type size for 128bit shuffle.");
  // Handling a 256-bit vector here would require VLX, and
  // lowerV2X128VectorShuffle() is most probably a better solution for that.
11760 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
11762 SmallVector<int, 4> WidenedMask;
11763 if (!canWidenShuffleElements(Mask, WidenedMask))
11766 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
  // Ensure elements came from the same Op.
11768 int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
11769 for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
11770 if (WidenedMask[i] == SM_SentinelZero)
11772 if (WidenedMask[i] == SM_SentinelUndef)
11775 SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
11776 unsigned OpIndex = (i < Size/2) ? 0 : 1;
11777 if (Ops[OpIndex].isUndef())
11779 else if (Ops[OpIndex] != Op)
11783 // Form a 128-bit permutation.
11784 // Convert the 64-bit shuffle mask selection values into 128-bit selection
11785 // bits defined by a vshuf64x2 instruction's immediate control byte.
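  // For example, a widened mask <0, 3, 4, 7> selects chunk 0 and chunk 3 of
  // the first source for the low two destination chunks and chunk 0 and
  // chunk 3 of the second source for the high two, giving PermMask = 0xcc.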
11786 unsigned PermMask = 0, Imm = 0;
11787 unsigned ControlBitsNum = WidenedMask.size() / 2;
11789 for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
11790 // Use first element in place of undef mask.
11791 Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
11792 PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
11795 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
11796 DAG.getConstant(PermMask, DL, MVT::i8));
11799 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
11800 ArrayRef<int> Mask, SDValue V1,
11801 SDValue V2, SelectionDAG &DAG) {
11803 assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
11805 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
11806 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
11808 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
11810 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
11812 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
11815 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
11816 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11817 SDValue V1, SDValue V2,
11818 const X86Subtarget &Subtarget,
11819 SelectionDAG &DAG) {
11820 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
11821 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
11822 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11824 if (V2.isUndef()) {
11825 // Use low duplicate instructions for masks that match their pattern.
11826 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
11827 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
11829 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
11830 // Non-half-crossing single input shuffles can be lowered with an
11831 // interleaved permutation.
11832 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
11833 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
11834 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
11835 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
11836 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
11837 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
11840 SmallVector<int, 4> RepeatedMask;
11841 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
11842 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
11843 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11846 if (SDValue Shuf128 =
11847 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
11850 if (SDValue Unpck =
11851 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
11854 // Check if the blend happens to exactly fit that of SHUFPD.
11856 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
11859 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
11862 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11864 SDValue V1, SDValue V2,
11865 const X86Subtarget &Subtarget,
11866 SelectionDAG &DAG) {
11867 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11868 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11869 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11871 // If the shuffle mask is repeated in each 128-bit lane, we have many more
11872 // options to efficiently lower the shuffle.
11873 SmallVector<int, 4> RepeatedMask;
11874 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
11875 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11877 // Use even/odd duplicate instructions for masks that match their pattern.
11878 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
11879 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
11880 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
11881 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
11884 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
11885 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11887 // Use dedicated unpack instructions for masks that match their pattern.
11888 if (SDValue Unpck =
11889 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
11892 // Otherwise, fall back to a SHUFPS sequence.
11893 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
11896 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
11899 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
11900 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11901 SDValue V1, SDValue V2,
11902 const X86Subtarget &Subtarget,
11903 SelectionDAG &DAG) {
11904 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11905 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11906 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11908 if (SDValue Shuf128 =
11909 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
11912 if (V2.isUndef()) {
11913 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on all four
    // 128-bit lanes.
11916 SmallVector<int, 2> Repeated128Mask;
11917 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
11918 SmallVector<int, 4> PSHUFDMask;
11919 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
11920 return DAG.getBitcast(
11922 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
11923 DAG.getBitcast(MVT::v16i32, V1),
11924 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11927 SmallVector<int, 4> Repeated256Mask;
11928 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
11929 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
11930 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
11933 // Try to use shift instructions.
11934 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
11938 if (SDValue Unpck =
11939 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
11942 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
11945 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
11946 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11947 SDValue V1, SDValue V2,
11948 const X86Subtarget &Subtarget,
11949 SelectionDAG &DAG) {
11950 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11951 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11952 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11954 // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the four 128-bit
  // lanes.
11957 SmallVector<int, 4> RepeatedMask;
11958 if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) {
11959 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11961 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
11962 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11964 // Use dedicated unpack instructions for masks that match their pattern.
11966 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
11970 // Try to use shift instructions.
11971 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
11975 // Try to use byte rotation instructions.
11976 if (Subtarget.hasBWI())
11977 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11978 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
11981 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
11984 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
11985 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11986 SDValue V1, SDValue V2,
11987 const X86Subtarget &Subtarget,
11988 SelectionDAG &DAG) {
11989 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11990 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11991 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11992 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
11994 // Use dedicated unpack instructions for masks that match their pattern.
11996 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
11999 // Try to use shift instructions.
12000 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
12004 // Try to use byte rotation instructions.
12005 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12006 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
12009 if (V2.isUndef()) {
12010 SmallVector<int, 8> RepeatedMask;
12011 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
12012 // As this is a single-input shuffle, the repeated mask should be
12013 // a strictly valid v8i16 mask that we can pass through to the v8i16
12014 // lowering to handle even the v32 case.
12015 return lowerV8I16GeneralSingleInputVectorShuffle(
12016 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
12020 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
12023 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
12024 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12025 SDValue V1, SDValue V2,
12026 const X86Subtarget &Subtarget,
12027 SelectionDAG &DAG) {
12028 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
12029 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
12030 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
12031 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
12033 // Use dedicated unpack instructions for masks that match their pattern.
12035 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
12038 // Try to use shift instructions.
12039 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
12043 // Try to use byte rotation instructions.
12044 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12045 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
12048 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1,
12049 V2, Subtarget, DAG))
12052 // FIXME: Implement direct support for this type!
12053 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
12056 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
12058 /// This routine either breaks down the specific type of a 512-bit x86 vector
12059 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
12060 /// together based on the available instructions.
12061 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12062 MVT VT, SDValue V1, SDValue V2,
12063 const X86Subtarget &Subtarget,
12064 SelectionDAG &DAG) {
12065 assert(Subtarget.hasAVX512() &&
12066 "Cannot lower 512-bit vectors w/ basic ISA!");
12068 // Check for being able to broadcast a single element.
12069 if (SDValue Broadcast =
12070 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
12073 // Dispatch to each element type for lowering. If we don't have support for
12074 // specific element type shuffles at 512 bits, immediately split them and
12075 // lower them. Each lowering routine of a given type is allowed to assume that
12076 // the requisite ISA extensions for that element type are available.
12077 switch (VT.SimpleTy) {
12079 return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12081 return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12083 return lowerV8I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12085 return lowerV16I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12087 return lowerV32I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12089 return lowerV64I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12092 llvm_unreachable("Not a valid 512-bit x86 vector type!");
12096 // Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
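// For example, a v8i1 shuffle is sign-extended to v8i64 (a full 512-bit
// vector, which has the richest shuffle support on KNL), shuffled there, and
// the result is truncated back to v8i1.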
12100 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12101 MVT VT, SDValue V1, SDValue V2,
12102 const X86Subtarget &Subtarget,
12103 SelectionDAG &DAG) {
12104 assert(Subtarget.hasAVX512() &&
12105 "Cannot lower 512-bit vectors w/o basic ISA!");
12107 switch (VT.SimpleTy) {
12109 llvm_unreachable("Expected a vector of i1 elements");
12111 ExtVT = MVT::v2i64;
12114 ExtVT = MVT::v4i32;
12117 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
12120 ExtVT = MVT::v16i32;
12123 ExtVT = MVT::v32i16;
12126 ExtVT = MVT::v64i8;
12130 if (ISD::isBuildVectorAllZeros(V1.getNode()))
12131 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
12132 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
12133 V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
12135 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
12138 V2 = DAG.getUNDEF(ExtVT);
12139 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
12140 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
12141 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
12142 V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
12144 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
12145 return DAG.getNode(ISD::TRUNCATE, DL, VT,
12146 DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask));
12148 /// \brief Top-level lowering for x86 vector shuffles.
12150 /// This handles decomposition, canonicalization, and lowering of all x86
12151 /// vector shuffles. Most of the specific lowering strategies are encapsulated
12152 /// above in helper routines. The canonicalization attempts to widen shuffles
12153 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
12154 /// s.t. only one of the two inputs needs to be tested, etc.
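///
/// For example, a v4i32 shuffle with mask <0, 1, 4, 5> can be widened into a
/// v2i64 shuffle with mask <0, 2>.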
12155 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
12156 SelectionDAG &DAG) {
12157 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12158 ArrayRef<int> Mask = SVOp->getMask();
12159 SDValue V1 = Op.getOperand(0);
12160 SDValue V2 = Op.getOperand(1);
12161 MVT VT = Op.getSimpleValueType();
  int NumElements = VT.getVectorNumElements();
  SDLoc DL(Op);
12164 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
12166 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
12167 "Can't lower MMX shuffles");
12169 bool V1IsUndef = V1.isUndef();
12170 bool V2IsUndef = V2.isUndef();
12171 if (V1IsUndef && V2IsUndef)
12172 return DAG.getUNDEF(VT);
  // When we create a shuffle node we put the UNDEF node as the second operand,
12175 // but in some cases the first operand may be transformed to UNDEF.
12176 // In this case we should just commute the node.
12178 return DAG.getCommutedVectorShuffle(*SVOp);
  // Check for non-undef masks pointing at an undef vector and make the masks
  // undef as well. This makes it easier to match the shuffle based solely on
  // the mask.
  if (V2IsUndef)
    for (int M : Mask)
      if (M >= NumElements) {
        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
        for (int &M : NewMask)
          if (M >= NumElements)
            M = -1;
        return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
      }
12193 // We actually see shuffles that are entirely re-arrangements of a set of
12194 // zero inputs. This mostly happens while decomposing complex shuffles into
12195 // simple ones. Directly lower these as a buildvector of zeros.
12196 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
12197 if (Zeroable.all())
12198 return getZeroVector(VT, Subtarget, DAG, DL);
12200 // Try to collapse shuffles into using a vector type with fewer elements but
12201 // wider element types. We cap this to not form integers or floating point
12202 // elements wider than 64 bits, but it might be interesting to form i128
12203 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
12204 SmallVector<int, 16> WidenedMask;
12205 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
12206 canWidenShuffleElements(Mask, WidenedMask)) {
12207 MVT NewEltVT = VT.isFloatingPoint()
12208 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
12209 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
12210 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
    // Make sure that the new vector type is legal. For example, v2f64 isn't
    // legal on SSE1.
12213 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12214 V1 = DAG.getBitcast(NewVT, V1);
12215 V2 = DAG.getBitcast(NewVT, V2);
12216 return DAG.getBitcast(
12217 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
  int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
  for (int M : Mask)
    if (M < 0)
      ++NumUndefElements;
    else if (M < NumElements)
      ++NumV1Elements;
    else
      ++NumV2Elements;
12230 // Commute the shuffle as needed such that more elements come from V1 than
12231 // V2. This allows us to match the shuffle pattern strictly on how many
12232 // elements come from V1 without handling the symmetric cases.
12233 if (NumV2Elements > NumV1Elements)
12234 return DAG.getCommutedVectorShuffle(*SVOp);
12236 assert(NumV1Elements > 0 && "No V1 indices");
12237 assert((NumV2Elements > 0 || V2IsUndef) && "V2 not undef, but not used");
12239 // When the number of V1 and V2 elements are the same, try to minimize the
12240 // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum of
12242 // indices for V2. When those are equal, try to ensure that the number of odd
12243 // indices for V1 is lower than the number of odd indices for V2.
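// For illustration (hypothetical mask): with NumElements == 4, the mask
// <4, 5, 0, 1> draws two elements from each input, but both low-half lanes
// come from V2, so the shuffle is commuted and the mask becomes <0, 1, 4, 5>.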
12244 if (NumV1Elements == NumV2Elements) {
12245 int LowV1Elements = 0, LowV2Elements = 0;
12246 for (int M : Mask.slice(0, NumElements / 2))
12247 if (M >= NumElements)
12248 ++LowV2Elements;
12249 else if (M >= 0)
12250 ++LowV1Elements;
12251 if (LowV2Elements > LowV1Elements)
12252 return DAG.getCommutedVectorShuffle(*SVOp);
12253 if (LowV2Elements == LowV1Elements) {
12254 int SumV1Indices = 0, SumV2Indices = 0;
12255 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12256 if (Mask[i] >= NumElements)
12257 SumV2Indices += i;
12258 else if (Mask[i] >= 0)
12259 SumV1Indices += i;
12260 if (SumV2Indices < SumV1Indices)
12261 return DAG.getCommutedVectorShuffle(*SVOp);
12262 if (SumV2Indices == SumV1Indices) {
12263 int NumV1OddIndices = 0, NumV2OddIndices = 0;
12264 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12265 if (Mask[i] >= NumElements)
12266 NumV2OddIndices += i % 2;
12267 else if (Mask[i] >= 0)
12268 NumV1OddIndices += i % 2;
12269 if (NumV2OddIndices < NumV1OddIndices)
12270 return DAG.getCommutedVectorShuffle(*SVOp);
12275 // For each vector width, delegate to a specialized lowering routine.
12276 if (VT.is128BitVector())
12277 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12279 if (VT.is256BitVector())
12280 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12282 if (VT.is512BitVector())
12283 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12285 if (Is1BitVector)
12286 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12288 llvm_unreachable("Unimplemented!");
12291 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
12292 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
12293 const X86Subtarget &Subtarget,
12294 SelectionDAG &DAG) {
12295 SDValue Cond = Op.getOperand(0);
12296 SDValue LHS = Op.getOperand(1);
12297 SDValue RHS = Op.getOperand(2);
12298 SDLoc dl(Op);
12299 MVT VT = Op.getSimpleValueType();
12301 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12302 return SDValue();
12303 auto *CondBV = cast<BuildVectorSDNode>(Cond);
12305 // Only non-legal VSELECTs reach this lowering, convert those into generic
12306 // shuffles and re-use the shuffle lowering path for blends.
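// Sketch of the mapping (illustrative values): for a v4i32 vselect with the
// constant condition <-1, 0, -1, 0>, lane i takes LHS when the condition is
// non-zero and RHS otherwise, producing the blend mask <0, 5, 2, 7>.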
12307 SmallVector<int, 32> Mask;
12308 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
12309 SDValue CondElt = CondBV->getOperand(i);
12310 Mask.push_back(
12311 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
12312 : -1);
12313 }
12314 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
12317 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12318 // A vselect where all conditions and data are constants can be optimized into
12319 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12320 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12321 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12322 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12325 // Try to lower this to a blend-style vector shuffle. This can handle all
12326 // constant condition cases.
12327 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
12330 // Variable blends are only legal from SSE4.1 onward.
12331 if (!Subtarget.hasSSE41())
12334 // Only some types will be legal on some subtargets. If we can emit a legal
12335 // VSELECT-matching blend, return Op, but if we need to expand, return
12336 // a null SDValue.
12337 switch (Op.getSimpleValueType().SimpleTy) {
12339 // Most of the vector types have blends past SSE4.1.
12343 // The byte blends for AVX vectors were introduced only in AVX2.
12344 if (Subtarget.hasAVX2())
12351 // AVX-512 BWI and VLX features support VSELECT with i16 elements.
12352 if (Subtarget.hasBWI() && Subtarget.hasVLX())
12355 // FIXME: We should custom lower this by fixing the condition and using i8
12361 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12362 MVT VT = Op.getSimpleValueType();
12365 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12368 if (VT.getSizeInBits() == 8) {
12369 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12370 Op.getOperand(0), Op.getOperand(1));
12371 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12372 DAG.getValueType(VT));
12373 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12376 if (VT.getSizeInBits() == 16) {
12377 // If Idx is 0, it's cheaper to do a move instead of a pextrw.
12378 if (isNullConstant(Op.getOperand(1)))
12379 return DAG.getNode(
12380 ISD::TRUNCATE, dl, MVT::i16,
12381 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12382 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
12383 Op.getOperand(1)));
12384 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
12385 Op.getOperand(0), Op.getOperand(1));
12386 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12387 DAG.getValueType(VT));
12388 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12391 if (VT == MVT::f32) {
12392 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
12393 // the result back to FR32 register. It's only worth matching if the
12394 // result has a single use which is a store or a bitcast to i32. And in
12395 // the case of a store, it's not worth it if the index is a constant 0,
12396 // because a MOVSSmr can be used instead, which is smaller and faster.
12397 if (!Op.hasOneUse())
12399 SDNode *User = *Op.getNode()->use_begin();
12400 if ((User->getOpcode() != ISD::STORE ||
12401 isNullConstant(Op.getOperand(1))) &&
12402 (User->getOpcode() != ISD::BITCAST ||
12403 User->getValueType(0) != MVT::i32))
12405 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12406 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
12408 return DAG.getBitcast(MVT::f32, Extract);
12411 if (VT == MVT::i32 || VT == MVT::i64) {
12412 // ExtractPS/pextrq works with constant index.
12413 if (isa<ConstantSDNode>(Op.getOperand(1)))
12419 /// Extract one bit from a mask vector, like v16i1 or v8i1.
12420 /// AVX-512 feature.
12422 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
12423 SDValue Vec = Op.getOperand(0);
12425 MVT VecVT = Vec.getSimpleValueType();
12426 SDValue Idx = Op.getOperand(1);
12427 MVT EltVT = Op.getSimpleValueType();
12429 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
12430 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
12431 "Unexpected vector type in ExtractBitFromMaskVector");
12433 // A variable index can't be handled in mask registers;
12434 // extend the vector to VR512.
12435 if (!isa<ConstantSDNode>(Idx)) {
12436 MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
12437 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
12438 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12439 ExtVT.getVectorElementType(), Ext, Idx);
12440 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
12443 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12444 if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
12445 // Use kshiftlw/rw instruction.
12446 VecVT = MVT::v16i1;
12447 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
12448 DAG.getUNDEF(VecVT),
12449 Vec,
12450 DAG.getIntPtrConstant(0, dl));
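// Illustrative example: for a v16i1 mask and IdxVal == 3, the code below
// shifts the mask left by 15 - 3 == 12 so the requested bit lands in the
// top position, then logically shifts right by 15 so it lands in bit 0,
// where it can be extracted as the i1 result.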
12452 unsigned MaxShift = VecVT.getVectorNumElements() - 1;
12453 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
12454 DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
12455 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
12456 DAG.getConstant(MaxShift, dl, MVT::i8));
12457 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
12458 DAG.getIntPtrConstant(0, dl));
12462 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12463 SelectionDAG &DAG) const {
12465 SDValue Vec = Op.getOperand(0);
12466 MVT VecVT = Vec.getSimpleValueType();
12467 SDValue Idx = Op.getOperand(1);
12469 if (Op.getSimpleValueType() == MVT::i1)
12470 return ExtractBitFromMaskVector(Op, DAG);
12472 if (!isa<ConstantSDNode>(Idx)) {
12473 if (VecVT.is512BitVector() ||
12474 (VecVT.is256BitVector() && Subtarget.hasInt256() &&
12475 VecVT.getVectorElementType().getSizeInBits() == 32)) {
12477 MVT MaskEltVT =
12478 MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
12479 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
12480 MaskEltVT.getSizeInBits());
12482 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
12483 auto PtrVT = getPointerTy(DAG.getDataLayout());
12484 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
12485 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
12486 DAG.getConstant(0, dl, PtrVT));
12487 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
12488 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
12489 DAG.getConstant(0, dl, PtrVT));
12494 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12496 // If this is a 256-bit vector result, first extract the 128-bit vector and
12497 // then extract the element from the 128-bit vector.
12498 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
12499 // Get the 128-bit vector.
12500 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
12501 MVT EltVT = VecVT.getVectorElementType();
12503 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
12504 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
12506 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
12507 // this can be done with a mask.
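// For example (illustrative): extracting element 5 from a v8i32 first pulls
// out the upper 128-bit half (elements 4..7) and then reads element
// 5 & 3 == 1 from that v4i32 chunk.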
12508 IdxVal &= ElemsPerChunk - 1;
12509 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
12510 DAG.getConstant(IdxVal, dl, MVT::i32));
12513 assert(VecVT.is128BitVector() && "Unexpected vector length");
12515 if (Subtarget.hasSSE41())
12516 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
12519 MVT VT = Op.getSimpleValueType();
12520 // TODO: handle v16i8.
12521 if (VT.getSizeInBits() == 16) {
12523 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12524 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12525 DAG.getBitcast(MVT::v4i32, Vec), Idx));
12527 // Transform it so it matches pextrw, which produces a 32-bit result.
12528 MVT EltVT = MVT::i32;
12529 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Vec, Idx);
12530 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
12531 DAG.getValueType(VT));
12532 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12535 if (VT.getSizeInBits() == 32) {
12536 if (IdxVal == 0)
12537 return Op;
12539 // SHUFPS the element to the lowest double word, then movss.
12540 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
12541 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
12542 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12543 DAG.getIntPtrConstant(0, dl));
12546 if (VT.getSizeInBits() == 64) {
12547 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
12548 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
12549 // to match extract_elt for f64.
12550 if (IdxVal == 0)
12551 return Op;
12553 // UNPCKHPD the element to the lowest double word, then movsd.
12554 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
12555 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
12556 int Mask[2] = { 1, -1 };
12557 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
12558 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12559 DAG.getIntPtrConstant(0, dl));
12565 /// Insert one bit into a mask vector, like v16i1 or v8i1.
12566 /// AVX-512 feature.
12568 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
12570 SDValue Vec = Op.getOperand(0);
12571 SDValue Elt = Op.getOperand(1);
12572 SDValue Idx = Op.getOperand(2);
12573 MVT VecVT = Vec.getSimpleValueType();
12575 if (!isa<ConstantSDNode>(Idx)) {
12576 // Non-constant index. Extend source and destination,
12577 // insert element and then truncate the result.
12578 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
12579 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
12580 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
12581 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
12582 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
12583 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
12586 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12587 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
12588 if (IdxVal)
12589 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
12590 DAG.getConstant(IdxVal, dl, MVT::i8));
12591 if (Vec.isUndef())
12592 return EltInVec;
12593 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
12596 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
12597 SelectionDAG &DAG) const {
12598 MVT VT = Op.getSimpleValueType();
12599 MVT EltVT = VT.getVectorElementType();
12600 unsigned NumElts = VT.getVectorNumElements();
12602 if (EltVT == MVT::i1)
12603 return InsertBitToMaskVector(Op, DAG);
12606 SDValue N0 = Op.getOperand(0);
12607 SDValue N1 = Op.getOperand(1);
12608 SDValue N2 = Op.getOperand(2);
12609 if (!isa<ConstantSDNode>(N2))
12611 auto *N2C = cast<ConstantSDNode>(N2);
12612 unsigned IdxVal = N2C->getZExtValue();
12614 // If we are clearing out an element, we do this more efficiently with a
12615 // blend shuffle than a costly integer insertion.
12616 // TODO: would other rematerializable values (e.g. allbits) benefit as well?
12617 // TODO: pre-SSE41 targets will tend to use bit masking - this could still
12618 // be beneficial if we are inserting several zeros and can combine the masks.
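// Illustrative example: zeroing element 2 of a v4i32 builds the clear mask
// <0, 1, 6, 3>, i.e. lanes 0, 1 and 3 come from the original vector and
// lane 2 comes from the all-zeros vector built below.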
12619 if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
12620 SmallVector<int, 8> ClearMask;
12621 for (unsigned i = 0; i != NumElts; ++i)
12622 ClearMask.push_back(i == IdxVal ? i + NumElts : i);
12623 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
12624 return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
12627 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
12628 // into that, and then insert the subvector back into the result.
12629 if (VT.is256BitVector() || VT.is512BitVector()) {
12630 // With a 256-bit vector, we can insert into the zero element efficiently
12631 // using a blend if we have AVX or AVX2 and the right data type.
12632 if (VT.is256BitVector() && IdxVal == 0) {
12633 // TODO: It is worthwhile to cast integer to floating point and back
12634 // and incur a domain crossing penalty if that's what we'll end up
12635 // doing anyway after extracting to a 128-bit vector.
12636 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12637 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
12638 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
12639 N2 = DAG.getIntPtrConstant(1, dl);
12640 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
12644 // Get the desired 128-bit vector chunk.
12645 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
12647 // Insert the element into the desired chunk.
12648 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
12649 assert(isPowerOf2_32(NumEltsIn128));
12650 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
12651 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
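// For example (illustrative): inserting into element 9 of a v16i16 extracts
// the second 128-bit chunk (elements 8..15) and re-inserts at index
// 9 & 7 == 1 within that chunk before the chunk is put back below.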
12653 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
12654 DAG.getConstant(IdxIn128, dl, MVT::i32));
12656 // Insert the changed part back into the bigger vector
12657 return insert128BitVector(N0, V, IdxVal, DAG, dl);
12659 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
12661 if (Subtarget.hasSSE41()) {
12662 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
12663 unsigned Opc;
12664 if (VT == MVT::v8i16) {
12665 Opc = X86ISD::PINSRW;
12666 } else {
12667 assert(VT == MVT::v16i8);
12668 Opc = X86ISD::PINSRB;
12669 }
12671 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
12672 // argument.
12673 if (N1.getValueType() != MVT::i32)
12674 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
12675 if (N2.getValueType() != MVT::i32)
12676 N2 = DAG.getIntPtrConstant(IdxVal, dl);
12677 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
12680 if (EltVT == MVT::f32) {
12681 // Bits [7:6] of the constant are the source select. This will always be
12682 // zero here. The DAG Combiner may combine an extract_elt index into
12683 // these bits. For example (insert (extract, 3), 2) could be matched by
12684 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
12685 // Bits [5:4] of the constant are the destination select. This is the
12686 // value of the incoming immediate.
12687 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
12688 // combine either bitwise AND or insert of float 0.0 to set these bits.
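// As a worked example (illustrative): inserting a scalar into destination
// lane 2 with no zeroing uses the immediate 2 << 4 == 0x20, which is exactly
// the IdxVal << 4 computed further down for the INSERTPS path.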
12690 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
12691 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
12692 // If this is an insertion of 32-bits into the low 32-bits of
12693 // a vector, we prefer to generate a blend with immediate rather
12694 // than an insertps. Blends are simpler operations in hardware and so
12695 // will always have equal or better performance than insertps.
12696 // But if optimizing for size and there's a load folding opportunity,
12697 // generate insertps because blendps does not have a 32-bit memory
12699 N2 = DAG.getIntPtrConstant(1, dl);
12700 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
12701 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
12703 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
12704 // Create this as a scalar to vector.
12705 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
12706 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
12709 if (EltVT == MVT::i32 || EltVT == MVT::i64) {
12710 // PINSR* works with constant index.
12715 if (EltVT == MVT::i8)
12718 if (EltVT.getSizeInBits() == 16) {
12719 // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
12720 // as its second argument.
12721 if (N1.getValueType() != MVT::i32)
12722 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
12723 if (N2.getValueType() != MVT::i32)
12724 N2 = DAG.getIntPtrConstant(IdxVal, dl);
12725 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
12730 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
12732 MVT OpVT = Op.getSimpleValueType();
12734 // If this is a 256-bit vector result, first insert into a 128-bit
12735 // vector and then insert into the 256-bit vector.
12736 if (!OpVT.is128BitVector()) {
12737 // Insert into a 128-bit vector.
12738 unsigned SizeFactor = OpVT.getSizeInBits()/128;
12739 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
12740 OpVT.getVectorNumElements() / SizeFactor);
12742 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
12744 // Insert the 128-bit vector.
12745 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
12748 if (OpVT == MVT::v1i64 &&
12749 Op.getOperand(0).getValueType() == MVT::i64)
12750 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
12752 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
12753 assert(OpVT.is128BitVector() && "Expected an SSE type!");
12754 return DAG.getBitcast(
12755 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
12758 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
12759 // a simple subregister reference or explicit instructions to grab
12760 // upper bits of a vector.
12761 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
12762 SelectionDAG &DAG) {
12764 SDValue In = Op.getOperand(0);
12765 SDValue Idx = Op.getOperand(1);
12766 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12767 MVT ResVT = Op.getSimpleValueType();
12768 MVT InVT = In.getSimpleValueType();
12770 if (Subtarget.hasFp256()) {
12771 if (ResVT.is128BitVector() &&
12772 (InVT.is256BitVector() || InVT.is512BitVector()) &&
12773 isa<ConstantSDNode>(Idx)) {
12774 return extract128BitVector(In, IdxVal, DAG, dl);
12776 if (ResVT.is256BitVector() && InVT.is512BitVector() &&
12777 isa<ConstantSDNode>(Idx)) {
12778 return extract256BitVector(In, IdxVal, DAG, dl);
12784 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
12785 // simple superregister reference or explicit instructions to insert
12786 // the upper bits of a vector.
12787 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
12788 SelectionDAG &DAG) {
12789 if (!Subtarget.hasAVX())
12793 SDValue Vec = Op.getOperand(0);
12794 SDValue SubVec = Op.getOperand(1);
12795 SDValue Idx = Op.getOperand(2);
12797 if (!isa<ConstantSDNode>(Idx))
12800 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12801 MVT OpVT = Op.getSimpleValueType();
12802 MVT SubVecVT = SubVec.getSimpleValueType();
12804 // Fold two 16-byte subvector loads into one 32-byte load:
12805 // (insert_subvector (insert_subvector undef, (load addr), 0),
12806 // (load addr + 16), Elts/2)
12808 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
12809 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
12810 OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
12811 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
12812 if (Idx2 && Idx2->getZExtValue() == 0) {
12813 // If needed, look through bitcasts to get to the load.
12814 SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1));
12815 if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
12817 unsigned Alignment = FirstLd->getAlignment();
12818 unsigned AS = FirstLd->getAddressSpace();
12819 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
12820 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
12821 OpVT, AS, Alignment, &Fast) && Fast) {
12822 SDValue Ops[] = { SubVec2, SubVec };
12823 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
12830 if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
12831 SubVecVT.is128BitVector())
12832 return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
12834 if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
12835 return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
12837 if (OpVT.getVectorElementType() == MVT::i1)
12838 return insert1BitVector(Op, DAG, Subtarget);
12843 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
12844 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
12845 // one of the above mentioned nodes. It has to be wrapped because otherwise
12846 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
12847 // be used to form an addressing mode. These wrapped nodes will be selected
12848 // into MOV32ri.
12850 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
12851 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12853 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12854 // global base reg.
12855 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
12856 unsigned WrapperKind = X86ISD::Wrapper;
12857 CodeModel::Model M = DAG.getTarget().getCodeModel();
12859 if (Subtarget.isPICStyleRIPRel() &&
12860 (M == CodeModel::Small || M == CodeModel::Kernel))
12861 WrapperKind = X86ISD::WrapperRIP;
12863 auto PtrVT = getPointerTy(DAG.getDataLayout());
12864 SDValue Result = DAG.getTargetConstantPool(
12865 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
12867 Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12868 // With PIC, the address is actually $g + Offset.
12871 DAG.getNode(ISD::ADD, DL, PtrVT,
12872 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12878 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
12879 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12881 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12882 // global base reg.
12883 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
12884 unsigned WrapperKind = X86ISD::Wrapper;
12885 CodeModel::Model M = DAG.getTarget().getCodeModel();
12887 if (Subtarget.isPICStyleRIPRel() &&
12888 (M == CodeModel::Small || M == CodeModel::Kernel))
12889 WrapperKind = X86ISD::WrapperRIP;
12891 auto PtrVT = getPointerTy(DAG.getDataLayout());
12892 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
12894 Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12896 // With PIC, the address is actually $g + Offset.
12899 DAG.getNode(ISD::ADD, DL, PtrVT,
12900 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12906 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
12907 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
12909 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12910 // global base reg.
12911 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
12912 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
12913 unsigned WrapperKind = X86ISD::Wrapper;
12914 CodeModel::Model M = DAG.getTarget().getCodeModel();
12916 if (Subtarget.isPICStyleRIPRel() &&
12917 (M == CodeModel::Small || M == CodeModel::Kernel))
12918 WrapperKind = X86ISD::WrapperRIP;
12920 auto PtrVT = getPointerTy(DAG.getDataLayout());
12921 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
12924 Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12926 // With PIC, the address is actually $g + Offset.
12927 if (isPositionIndependent() && !Subtarget.is64Bit()) {
12929 DAG.getNode(ISD::ADD, DL, PtrVT,
12930 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12933 // For symbols that require a load from a stub to get the address, emit the
12935 if (isGlobalStubReference(OpFlag))
12936 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
12937 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
12943 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
12944 // Create the TargetBlockAddressAddress node.
12945 unsigned char OpFlags =
12946 Subtarget.classifyBlockAddressReference();
12947 CodeModel::Model M = DAG.getTarget().getCodeModel();
12948 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
12949 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
12951 auto PtrVT = getPointerTy(DAG.getDataLayout());
12952 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
12954 if (Subtarget.isPICStyleRIPRel() &&
12955 (M == CodeModel::Small || M == CodeModel::Kernel))
12956 Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
12958 Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
12960 // With PIC, the address is actually $g + Offset.
12961 if (isGlobalRelativeToPICBase(OpFlags)) {
12962 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
12963 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
12969 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
12970 const SDLoc &dl, int64_t Offset,
12971 SelectionDAG &DAG) const {
12972 // Create the TargetGlobalAddress node, folding in the constant
12973 // offset if it is legal.
12974 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
12975 CodeModel::Model M = DAG.getTarget().getCodeModel();
12976 auto PtrVT = getPointerTy(DAG.getDataLayout());
12978 if (OpFlags == X86II::MO_NO_FLAG &&
12979 X86::isOffsetSuitableForCodeModel(Offset, M)) {
12980 // A direct static reference to a global.
12981 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
12984 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
12987 if (Subtarget.isPICStyleRIPRel() &&
12988 (M == CodeModel::Small || M == CodeModel::Kernel))
12989 Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
12991 Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
12993 // With PIC, the address is actually $g + Offset.
12994 if (isGlobalRelativeToPICBase(OpFlags)) {
12995 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
12996 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
12999 // For globals that require a load from a stub to get the address, emit the
13001 if (isGlobalStubReference(OpFlags))
13002 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
13003 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
13005 // If there was a non-zero offset that we didn't fold, create an explicit
13006 // addition for it.
13008 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
13009 DAG.getConstant(Offset, dl, PtrVT));
13015 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13016 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13017 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13018 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13022 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13023 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13024 unsigned char OperandFlags, bool LocalDynamic = false) {
13025 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13026 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13028 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13029 GA->getValueType(0),
13033 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13037 SDValue Ops[] = { Chain, TGA, *InFlag };
13038 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13040 SDValue Ops[] = { Chain, TGA };
13041 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13044 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
13045 MFI->setAdjustsStack(true);
13046 MFI->setHasCalls(true);
13048 SDValue Flag = Chain.getValue(1);
13049 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13052 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13054 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13057 SDLoc dl(GA); // ? function entry point might be better
13058 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13059 DAG.getNode(X86ISD::GlobalBaseReg,
13060 SDLoc(), PtrVT), InFlag);
13061 InFlag = Chain.getValue(1);
13063 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13066 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13068 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13070 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13071 X86::RAX, X86II::MO_TLSGD);
13074 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13080 // Get the start address of the TLS block for this module.
13081 X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13082 .getInfo<X86MachineFunctionInfo>();
13083 MFI->incNumLocalDynamicTLSAccesses();
13087 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13088 X86II::MO_TLSLD, /*LocalDynamic=*/true);
13091 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13092 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13093 InFlag = Chain.getValue(1);
13094 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13095 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13098 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13102 unsigned char OperandFlags = X86II::MO_DTPOFF;
13103 unsigned WrapperKind = X86ISD::Wrapper;
13104 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13105 GA->getValueType(0),
13106 GA->getOffset(), OperandFlags);
13107 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13109 // Add x@dtpoff with the base.
13110 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13113 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13114 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13115 const EVT PtrVT, TLSModel::Model model,
13116 bool is64Bit, bool isPIC) {
13119 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13120 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13121 is64Bit ? 257 : 256));
13123 SDValue ThreadPointer =
13124 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
13125 MachinePointerInfo(Ptr));
13127 unsigned char OperandFlags = 0;
13128 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
13130 unsigned WrapperKind = X86ISD::Wrapper;
13131 if (model == TLSModel::LocalExec) {
13132 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13133 } else if (model == TLSModel::InitialExec) {
13135 OperandFlags = X86II::MO_GOTTPOFF;
13136 WrapperKind = X86ISD::WrapperRIP;
13138 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13141 llvm_unreachable("Unexpected model");
13144 // emit "addl x@ntpoff,%eax" (local exec)
13145 // or "addl x@indntpoff,%eax" (initial exec)
13146 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13148 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13149 GA->getOffset(), OperandFlags);
13150 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13152 if (model == TLSModel::InitialExec) {
13153 if (isPIC && !is64Bit) {
13154 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13155 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13159 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13160 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
13163 // The address of the thread local variable is the add of the thread
13164 // pointer with the offset of the variable.
13165 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13169 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13171 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13173 if (DAG.getTarget().Options.EmulatedTLS)
13174 return LowerToTLSEmulatedModel(GA, DAG);
13176 const GlobalValue *GV = GA->getGlobal();
13177 auto PtrVT = getPointerTy(DAG.getDataLayout());
13178 bool PositionIndependent = isPositionIndependent();
13180 if (Subtarget.isTargetELF()) {
13181 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13183 case TLSModel::GeneralDynamic:
13184 if (Subtarget.is64Bit())
13185 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
13186 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
13187 case TLSModel::LocalDynamic:
13188 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
13189 Subtarget.is64Bit());
13190 case TLSModel::InitialExec:
13191 case TLSModel::LocalExec:
13192 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
13193 PositionIndependent);
13195 llvm_unreachable("Unknown TLS model.");
13198 if (Subtarget.isTargetDarwin()) {
13199 // Darwin only has one model of TLS. Lower to that.
13200 unsigned char OpFlag = 0;
13201 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
13202 X86ISD::WrapperRIP : X86ISD::Wrapper;
13204 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13205 // global base reg.
13206 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
13208 OpFlag = X86II::MO_TLVP_PIC_BASE;
13210 OpFlag = X86II::MO_TLVP;
13212 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13213 GA->getValueType(0),
13214 GA->getOffset(), OpFlag);
13215 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
13217 // With PIC32, the address is actually $g + Offset.
13219 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
13220 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13223 // Lowering the machine isd will make sure everything is in the right
13225 SDValue Chain = DAG.getEntryNode();
13226 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13227 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
13228 SDValue Args[] = { Chain, Offset };
13229 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13230 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
13231 DAG.getIntPtrConstant(0, DL, true),
13232 Chain.getValue(1), DL);
13234 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
13235 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13236 MFI->setAdjustsStack(true);
13238 // And our return value (tls address) is in the standard call return value
13240 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
13241 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
13244 if (Subtarget.isTargetKnownWindowsMSVC() ||
13245 Subtarget.isTargetWindowsItanium() ||
13246 Subtarget.isTargetWindowsGNU()) {
13247 // Just use the implicit TLS architecture
13248 // Need to generate something similar to:
13249 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13251 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
13252 // mov rcx, qword [rdx+rcx*8]
13253 // mov eax, .tls$:tlsvar
13254 // [rax+rcx] contains the address
13255 // Windows 64bit: gs:0x58
13256 // Windows 32bit: fs:__tls_array
13259 SDValue Chain = DAG.getEntryNode();
13261 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13262 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13263 // use its literal value of 0x2C.
13264 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
13265 ? Type::getInt8PtrTy(*DAG.getContext(),
13267 : Type::getInt32PtrTy(*DAG.getContext(),
13270 SDValue TlsArray = Subtarget.is64Bit()
13271 ? DAG.getIntPtrConstant(0x58, dl)
13272 : (Subtarget.isTargetWindowsGNU()
13273 ? DAG.getIntPtrConstant(0x2C, dl)
13274 : DAG.getExternalSymbol("_tls_array", PtrVT));
13276 SDValue ThreadPointer =
13277 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
13280 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
13281 res = ThreadPointer;
13283 // Load the _tls_index variable
13284 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
13285 if (Subtarget.is64Bit())
13286 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
13287 MachinePointerInfo(), MVT::i32);
13289 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
13291 auto &DL = DAG.getDataLayout();
13293 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
13294 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
13296 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
13299 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
13301 // Get the offset of start of .tls section
13302 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13303 GA->getValueType(0),
13304 GA->getOffset(), X86II::MO_SECREL);
13305 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
13307 // The address of the thread local variable is the add of the thread
13308 // pointer with the offset of the variable.
13309 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
13312 llvm_unreachable("TLS not implemented for this target.");
13315 /// Lower SRA_PARTS and friends, which return two i32 values
13316 /// and take a 2 x i32 value to shift plus a shift amount.
13317 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13318 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13319 MVT VT = Op.getSimpleValueType();
13320 unsigned VTBits = VT.getSizeInBits();
13322 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13323 SDValue ShOpLo = Op.getOperand(0);
13324 SDValue ShOpHi = Op.getOperand(1);
13325 SDValue ShAmt = Op.getOperand(2);
13326 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13327 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13329 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13330 DAG.getConstant(VTBits - 1, dl, MVT::i8));
13331 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13332 DAG.getConstant(VTBits - 1, dl, MVT::i8))
13333 : DAG.getConstant(0, dl, VT);
13335 SDValue Tmp2, Tmp3;
13336 if (Op.getOpcode() == ISD::SHL_PARTS) {
13337 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13338 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13340 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13341 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13344 // If the shift amount is larger or equal than the width of a part we can't
13345 // rely on the results of shld/shrd. Insert a test and select the appropriate
13346 // values for large shift amounts.
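// Illustrative example for SHL_PARTS of a 64-bit value on a 32-bit target:
// with a shift amount of 40, the masked amount is 40 & 31 == 8, and the
// CMOVs below select Hi = ShOpLo << 8 and Lo = 0 rather than the shld-based
// results, matching a full 64-bit shift by 40.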
13347 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13348 DAG.getConstant(VTBits, dl, MVT::i8));
13349 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13350 AndNode, DAG.getConstant(0, dl, MVT::i8));
13353 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
13354 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13355 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13356 SDValue Hi, Lo;
13357 if (Op.getOpcode() == ISD::SHL_PARTS) {
13358 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13359 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13361 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13362 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13365 SDValue Ops[2] = { Lo, Hi };
13366 return DAG.getMergeValues(Ops, dl);
13369 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13370 SelectionDAG &DAG) const {
13371 SDValue Src = Op.getOperand(0);
13372 MVT SrcVT = Src.getSimpleValueType();
13373 MVT VT = Op.getSimpleValueType();
13376 if (SrcVT.isVector()) {
13377 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
13378 return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
13379 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
13380 DAG.getUNDEF(SrcVT)));
13382 if (SrcVT.getVectorElementType() == MVT::i1) {
13383 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13384 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13385 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
13390 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13391 "Unknown SINT_TO_FP to lower!");
13393 // These are really Legal; return the operand so the caller accepts it as
13394 // Legal.
13395 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13396 return Op;
13397 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13398 Subtarget.is64Bit()) {
13399 return Op;
13400 }
13402 SDValue ValueToStore = Op.getOperand(0);
13403 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13404 !Subtarget.is64Bit())
13405 // Bitcasting to f64 here allows us to do a single 64-bit store from
13406 // an SSE register, avoiding the store forwarding penalty that would come
13407 // with two 32-bit stores.
13408 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
13410 unsigned Size = SrcVT.getSizeInBits()/8;
13411 MachineFunction &MF = DAG.getMachineFunction();
13412 auto PtrVT = getPointerTy(MF.getDataLayout());
13413 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13414 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13415 SDValue Chain = DAG.getStore(
13416 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
13417 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
13418 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
13421 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
13423 SelectionDAG &DAG) const {
13427 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
13429 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
13431 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
13433 unsigned ByteSize = SrcVT.getSizeInBits()/8;
13435 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
13436 MachineMemOperand *MMO;
13438 int SSFI = FI->getIndex();
13439 MMO = DAG.getMachineFunction().getMachineMemOperand(
13440 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13441 MachineMemOperand::MOLoad, ByteSize, ByteSize);
13443 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
13444 StackSlot = StackSlot.getOperand(1);
13446 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
13447 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
13449 Tys, Ops, SrcVT, MMO);
13452 Chain = Result.getValue(1);
13453 SDValue InFlag = Result.getValue(2);
13455 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
13456 // shouldn't be necessary except that RFP cannot be live across
13457 // multiple blocks. When stackifier is fixed, they can be uncoupled.
13458 MachineFunction &MF = DAG.getMachineFunction();
13459 unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
13460 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
13461 auto PtrVT = getPointerTy(MF.getDataLayout());
13462 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13463 Tys = DAG.getVTList(MVT::Other);
13465 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
13467 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
13468 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13469 MachineMemOperand::MOStore, SSFISize, SSFISize);
13471 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
13472 Ops, Op.getValueType(), MMO);
13473 Result = DAG.getLoad(
13474 Op.getValueType(), DL, Chain, StackSlot,
13475 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
13481 /// 64-bit unsigned integer to double expansion.
13482 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
13483 SelectionDAG &DAG) const {
13484 // This algorithm is not obvious. Here is what we're trying to output:
13487 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
13488 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
13490 haddpd %xmm0, %xmm0
13492 pshufd $0x4e, %xmm0, %xmm1
13498 LLVMContext *Context = DAG.getContext();
13500 // Build some magic constants.
13501 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
13502 Constant *C0 = ConstantDataVector::get(*Context, CV0);
13503 auto PtrVT = getPointerTy(DAG.getDataLayout());
13504 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
13506 SmallVector<Constant*,2> CV1;
13508 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13509 APInt(64, 0x4330000000000000ULL))));
13511 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13512 APInt(64, 0x4530000000000000ULL))));
13513 Constant *C1 = ConstantVector::get(CV1);
13514 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
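// Rough sketch of why these constants work (explanatory note): pairing the
// low 32 bits of the input with the exponent word 0x43300000 yields the
// double 2^52 + lo, and pairing the high 32 bits with 0x45300000 yields
// 2^84 + hi * 2^32; subtracting the biases in C1 (2^52 and 2^84) and adding
// the two lanes reconstructs the unsigned 64-bit value as a double.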
13516 // Load the 64-bit value into an XMM register.
13517 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
13520 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
13521 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
13522 /* Alignment = */ 16);
13524 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
13527 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
13528 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
13529 /* Alignment = */ 16);
13530 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
13531 // TODO: Are there any fast-math-flags to propagate here?
13532 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
13534 SDValue Result;
13535 if (Subtarget.hasSSE3()) {
13536 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
13537 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
13539 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
13540 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
13541 S2F, 0x4E, DAG);
13542 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
13543 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
13546 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
13547 DAG.getIntPtrConstant(0, dl));
13550 /// 32-bit unsigned integer to float expansion.
13551 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
13552 SelectionDAG &DAG) const {
13554 // FP constant to bias correct the final result.
13555 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
13556 MVT::f64);
13558 // Load the 32-bit value into an XMM register.
13559 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
13560 Op.getOperand(0));
13562 // Zero out the upper parts of the register.
13563 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
13565 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13566 DAG.getBitcast(MVT::v2f64, Load),
13567 DAG.getIntPtrConstant(0, dl));
13569 // Or the load with the bias.
13570 SDValue Or = DAG.getNode(
13571 ISD::OR, dl, MVT::v2i64,
13572 DAG.getBitcast(MVT::v2i64,
13573 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
13574 DAG.getBitcast(MVT::v2i64,
13575 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
13577 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13578 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
13580 // Subtract the bias.
13581 // TODO: Are there any fast-math-flags to propagate here?
13582 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
13584 // Handle final rounding.
13585 MVT DestVT = Op.getSimpleValueType();
13587 if (DestVT.bitsLT(MVT::f64))
13588 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
13589 DAG.getIntPtrConstant(0, dl));
13590 if (DestVT.bitsGT(MVT::f64))
13591 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
13593 // Handle final rounding.
13594 return Sub;
13597 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
13598 const X86Subtarget &Subtarget) {
13599 // The algorithm is the following:
13600 // #ifdef __SSE4_1__
13601 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13602 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13603 // (uint4) 0x53000000, 0xaa);
13605 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13606 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
13608 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13609 // return (float4) lo + fhi;
13611 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
13612 // reassociate the two FADDs, and if we do that, the algorithm fails
13613 // spectacularly (PR24512).
13614 // FIXME: If we ever have some kind of Machine FMF, this should be marked
13615 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
13616 // there's also the MachineCombiner reassociations happening on Machine IR.
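// Worked example (illustrative): for an input lane v == 0x87654321, lo gets
// float bits 0x4b004321 == 2^23 + 0x4321 and hi gets 0x53008765 ==
// 2^39 + 0x8765 * 2^16; subtracting (0x1.0p39f + 0x1.0p23f) from hi and then
// adding lo cancels both biases, so the final FADD rounds 2271560481 (the
// original unsigned value) into float as required.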
13617 if (DAG.getTarget().Options.UnsafeFPMath)
13621 SDValue V = Op->getOperand(0);
13622 MVT VecIntVT = V.getSimpleValueType();
13623 bool Is128 = VecIntVT == MVT::v4i32;
13624 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
13625 // If we convert to something other than the supported type, e.g., to v4f64,
13626 // abort early.
13627 if (VecFloatVT != Op->getSimpleValueType(0))
13628 return SDValue();
13630 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
13631 "Unsupported custom type");
13633 // In the #ifdef/#else code, we have in common:
13634 // - The vector of constants:
13635 // -- 0x4b000000
13636 // -- 0x53000000
13637 // - A shift:
13638 // -- v >> 16
13640 // Create the splat vector for 0x4b000000.
13641 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
13642 // Create the splat vector for 0x53000000.
13643 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
13645 // Create the right shift.
13646 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
13647 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
13649 SDValue Low, High;
13650 if (Subtarget.hasSSE41()) {
13651 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
13652 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13653 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
13654 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
13655 // Low will be bitcasted right away, so do not bother bitcasting back to its
13656 // original type.
13657 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
13658 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
13659 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13660 // (uint4) 0x53000000, 0xaa);
13661 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
13662 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
13663 // High will be bitcasted right away, so do not bother bitcasting back to
13664 // its original type.
13665 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
13666 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
13668 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
13669 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13670 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
13671 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
13673 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
13674 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
13677 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
13678 SDValue VecCstFAdd = DAG.getConstantFP(
13679 APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, VecFloatVT);
13681 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13682 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
13683 // TODO: Are there any fast-math-flags to propagate here?
13685 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
13686 // return (float4) lo + fhi;
13687 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
13688 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
13691 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
13692 SelectionDAG &DAG) const {
13693 SDValue N0 = Op.getOperand(0);
13694 MVT SVT = N0.getSimpleValueType();
13695 SDLoc dl(Op);
13697 switch (SVT.SimpleTy) {
13698 default:
13699 llvm_unreachable("Custom UINT_TO_FP is not supported!");
13700 case MVT::v4i8:
13701 case MVT::v4i16:
13702 case MVT::v8i8:
13703 case MVT::v8i16: {
13704 MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
13705 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13706 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
13707 }
13708 case MVT::v4i32:
13709 case MVT::v8i32:
13710 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
13711 case MVT::v16i8:
13712 case MVT::v16i16:
13713 assert(Subtarget.hasAVX512());
13714 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
13715 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
13719 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
13720 SelectionDAG &DAG) const {
13721 SDValue N0 = Op.getOperand(0);
13723 auto PtrVT = getPointerTy(DAG.getDataLayout());
13725 if (Op.getSimpleValueType().isVector())
13726 return lowerUINT_TO_FP_vec(Op, DAG);
13728 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
13729 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
13730 // the optimization here.
13731 if (DAG.SignBitIsZero(N0))
13732 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
13734 MVT SrcVT = N0.getSimpleValueType();
13735 MVT DstVT = Op.getSimpleValueType();
13737 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
13738 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
13739 // Conversions from unsigned i32 to f32/f64 are legal,
13740 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
13741 return Op;
13742 }
13744 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
13745 return LowerUINT_TO_FP_i64(Op, DAG);
13746 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
13747 return LowerUINT_TO_FP_i32(Op, DAG);
13748 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
13749 return SDValue();
13751 // Make a 64-bit buffer, and use it to build an FILD.
13752 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
13753 if (SrcVT == MVT::i32) {
13754 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
13755 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13756 StackSlot, MachinePointerInfo());
13757 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
13758 OffsetSlot, MachinePointerInfo());
13759 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
13760 return Fild;
13761 }
13763 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
13764 SDValue ValueToStore = Op.getOperand(0);
13765 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
13766 // Bitcasting to f64 here allows us to do a single 64-bit store from
13767 // an SSE register, avoiding the store forwarding penalty that would come
13768 // with two 32-bit stores.
13769 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
13770 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
13771 MachinePointerInfo());
13772 // For i64 source, we need to add the appropriate power of 2 if the input
13773 // was negative. This is the same as the optimization in
13774 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
13775 // we must be careful to do the computation in x87 extended precision, not
13776 // in SSE. (The generic code can't know it's OK to do this, or how to.)
13777 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
13778 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
13779 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13780 MachineMemOperand::MOLoad, 8, 8);
13782 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
13783 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
13784 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
13785 MVT::i64, MMO);
13787 APInt FF(32, 0x5F800000ULL);
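// Explanatory note: 0x5F800000 is 2^64 encoded as an IEEE single. FILD
// interprets the stored i64 as signed, so when the sign bit was set the
// loaded value is exactly 2^64 too small; the select below chooses offset 0
// (where FF sits in the pool entry) to add 2^64 back, and offset 4 (a zero)
// otherwise.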
13789 // Check whether the sign bit is set.
13790 SDValue SignSet = DAG.getSetCC(
13791 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
13792 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
13794 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
13795 SDValue FudgePtr = DAG.getConstantPool(
13796 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
13798 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
13799 SDValue Zero = DAG.getIntPtrConstant(0, dl);
13800 SDValue Four = DAG.getIntPtrConstant(4, dl);
13801 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
13803 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
13805 // Load the value out, extending it from f32 to f80.
13806 // FIXME: Avoid the extend by constructing the right constant pool?
13807 SDValue Fudge = DAG.getExtLoad(
13808 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
13809 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
13810 /* Alignment = */ 4);
13811 // Extend everything to 80 bits to force it to be done on x87.
13812 // TODO: Are there any fast-math-flags to propagate here?
13813 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
13814 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
13815 DAG.getIntPtrConstant(0, dl));
13818 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
13819 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
13820 // just return an <SDValue(), SDValue()> pair.
13821 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
13822 // to i16, i32 or i64, and we lower it to a legal sequence.
13823 // If lowered to the final integer result we return a <result, SDValue()> pair.
13824 // Otherwise we lower it to a sequence ending with a FIST, return a
13825 // <FIST, StackSlot> pair, and the caller is responsible for loading
13826 // the final integer result from StackSlot.
13827 std::pair<SDValue,SDValue>
13828 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
13829 bool IsSigned, bool IsReplace) const {
13832 EVT DstTy = Op.getValueType();
13833 EVT TheVT = Op.getOperand(0).getValueType();
13834 auto PtrVT = getPointerTy(DAG.getDataLayout());
13836 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
13837 // f16 must be promoted before using the lowering in this routine.
13838 // fp128 does not use this lowering.
13839 return std::make_pair(SDValue(), SDValue());
13842 // If using FIST to compute an unsigned i64, we'll need some fixup
13843 // to handle values above the maximum signed i64. A FIST is always
13844 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
13845 bool UnsignedFixup = !IsSigned &&
13846 DstTy == MVT::i64 &&
13847 (!Subtarget.is64Bit() ||
13848 !isScalarFPTypeInSSEReg(TheVT));
13850 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
13851 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
13852 // The low 32 bits of the fist result will have the correct uint32 result.
13853 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
13857 assert(DstTy.getSimpleVT() <= MVT::i64 &&
13858 DstTy.getSimpleVT() >= MVT::i16 &&
13859 "Unknown FP_TO_INT to lower!");
13861 // These are really Legal.
13862 if (DstTy == MVT::i32 &&
13863 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
13864 return std::make_pair(SDValue(), SDValue());
13865 if (Subtarget.is64Bit() &&
13866 DstTy == MVT::i64 &&
13867 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
13868 return std::make_pair(SDValue(), SDValue());
13870 // We lower FP->int64 into FISTP64 followed by a load from a temporary
13872 MachineFunction &MF = DAG.getMachineFunction();
13873 unsigned MemSize = DstTy.getSizeInBits()/8;
13874 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
13875 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13878 switch (DstTy.getSimpleVT().SimpleTy) {
13879 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
13880 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
13881 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
13882 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
13885 SDValue Chain = DAG.getEntryNode();
13886 SDValue Value = Op.getOperand(0);
13887 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
13889 if (UnsignedFixup) {
13891 // Conversion to unsigned i64 is implemented with a select,
13892 // depending on whether the source value fits in the range
13893 // of a signed i64. Let Thresh be the FP equivalent of
13894 // 0x8000000000000000ULL.
13896 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
13897 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
13898 // Fist-to-mem64 FistSrc
13899 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
13900 // to XOR'ing the high 32 bits with Adjust.
13902 // Being a power of 2, Thresh is exactly representable in all FP formats.
13903 // For X87 we'd like to use the smallest FP type for this constant, but
13904 // for DAG type consistency we have to match the FP operand type.
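// (The 0x5f000000 below encodes Thresh = 2^63 as an IEEE single; it is
// converted to TheVT before use.) Worked example: converting
// 0xC000000000000000 (3 * 2^62): Value >= Thresh, so Adjust = 0x80000000 and
// FistSrc = 2^62; the FIST stores 0x4000000000000000, and XOR'ing the high
// dword with Adjust yields 0xC000000000000000 as required.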
13906 APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
13907 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
13908 bool LosesInfo = false;
13909 if (TheVT == MVT::f64)
13910 // The rounding mode is irrelevant as the conversion should be exact.
13911 Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
13913 else if (TheVT == MVT::f80)
13914 Status = Thresh.convert(APFloat::x87DoubleExtended,
13915 APFloat::rmNearestTiesToEven, &LosesInfo);
13917 assert(Status == APFloat::opOK && !LosesInfo &&
13918 "FP conversion should have been exact");
13920 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
13922 SDValue Cmp = DAG.getSetCC(DL,
13923 getSetCCResultType(DAG.getDataLayout(),
13924 *DAG.getContext(), TheVT),
13925 Value, ThreshVal, ISD::SETLT);
13926 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
13927 DAG.getConstant(0, DL, MVT::i32),
13928 DAG.getConstant(0x80000000, DL, MVT::i32));
13929 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
13930 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
13931 *DAG.getContext(), TheVT),
13932 Value, ThreshVal, ISD::SETLT);
13933 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
13936 // FIXME: This causes a redundant load/store if the SSE-class value is already
13937 // in memory, such as when it is on the call stack.
13938 if (isScalarFPTypeInSSEReg(TheVT)) {
13939 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
13940 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
13941 MachinePointerInfo::getFixedStack(MF, SSFI));
13942 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
13944 Chain, StackSlot, DAG.getValueType(TheVT)
13947 MachineMemOperand *MMO =
13948 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
13949 MachineMemOperand::MOLoad, MemSize, MemSize);
13950 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
13951 Chain = Value.getValue(1);
13952 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
13953 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13956 MachineMemOperand *MMO =
13957 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
13958 MachineMemOperand::MOStore, MemSize, MemSize);
13960 if (UnsignedFixup) {
13962 // Insert the FIST, load its result as two i32's,
13963 // and XOR the high i32 with Adjust.
13965 SDValue FistOps[] = { Chain, Value, StackSlot };
13966 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
13967 FistOps, DstTy, MMO);
13970 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
13971 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
13974 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
13975 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
13977 if (Subtarget.is64Bit()) {
13978 // Join High32 and Low32 into a 64-bit result.
13979 // (High32 << 32) | Low32
13980 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
13981 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
13982 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
13983 DAG.getConstant(32, DL, MVT::i8));
13984 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
13985 return std::make_pair(Result, SDValue());
13988 SDValue ResultOps[] = { Low32, High32 };
13990 SDValue pair = IsReplace
13991 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
13992 : DAG.getMergeValues(ResultOps, DL);
13993 return std::make_pair(pair, SDValue());
13995 // Build the FP_TO_INT*_IN_MEM
13996 SDValue Ops[] = { Chain, Value, StackSlot };
13997 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
13999 return std::make_pair(FIST, StackSlot);
14003 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14004 const X86Subtarget &Subtarget) {
14005 MVT VT = Op->getSimpleValueType(0);
14006 SDValue In = Op->getOperand(0);
14007 MVT InVT = In.getSimpleValueType();
14010 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
14011 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
14013 // Optimize vectors in AVX mode:
14016 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
14017 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
14018 // Concat upper and lower parts.
14021 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
14022 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
14023 // Concat upper and lower parts.
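// E.g. for a v8i16 -> v8i32 zero extend, unpcklwd(In, Zero) interleaves the
// four low words with zero words (producing the four low i32 lanes) and
// unpckhwd(In, Zero) produces the four high lanes; the halves are bitcast to
// v4i32 and concatenated. For ANY_EXTEND the zero vector can be replaced by
// undef, which is what the NeedZero check below selects.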
14026 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14027 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14028 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14031 if (Subtarget.hasInt256())
14032 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14034 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14035 SDValue Undef = DAG.getUNDEF(InVT);
14036 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14037 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14038 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14040 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14041 VT.getVectorNumElements()/2);
14043 OpLo = DAG.getBitcast(HVT, OpLo);
14044 OpHi = DAG.getBitcast(HVT, OpHi);
14046 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14049 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14050 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14051 MVT VT = Op->getSimpleValueType(0);
14052 SDValue In = Op->getOperand(0);
14053 MVT InVT = In.getSimpleValueType();
14055 unsigned int NumElts = VT.getVectorNumElements();
14056 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
14059 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14060 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14062 assert(InVT.getVectorElementType() == MVT::i1);
14064 // Widen to a 512-bit vector type if VT is a 256- or 128-bit vector and VLX is not supported.
14066 if (!VT.is512BitVector() && !Subtarget.hasVLX())
14067 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
14070 DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
14072 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
14074 SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
14076 return SelectedVal;
14077 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
14080 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
14081 SelectionDAG &DAG) {
14082 if (Subtarget.hasFp256())
14083 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
14089 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
14090 SelectionDAG &DAG) {
14092 MVT VT = Op.getSimpleValueType();
14093 SDValue In = Op.getOperand(0);
14094 MVT SVT = In.getSimpleValueType();
14096 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14097 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
14099 if (Subtarget.hasFp256())
14100 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
14103 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14104 VT.getVectorNumElements() != SVT.getVectorNumElements());
14108 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
14109 const X86Subtarget &Subtarget) {
14112 MVT VT = Op.getSimpleValueType();
14113 SDValue In = Op.getOperand(0);
14114 MVT InVT = In.getSimpleValueType();
14116 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
14118 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
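// Truncation to i1 keeps only bit 0 of each element. Shifting that bit into
// the sign position lets VPMOVB2M/VPMOVW2M (which copy each element's sign
// bit into the mask) or TESTM (which checks ShiftNode & ShiftNode != 0 per
// element) recover it. E.g. a v16i8 element 0x03 shifted left by 7 becomes
// 0x80, producing a set mask bit.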
14119 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
14120 if (InVT.getScalarSizeInBits() <= 16) {
14121 if (Subtarget.hasBWI()) {
14122 // Legal: this will be selected to VPMOVB2M / VPMOVW2M.
14123 // Shifts of packed bytes are not supported natively, so bitcast to words.
14124 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
14125 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
14126 DAG.getBitcast(ExtVT, In),
14127 DAG.getConstant(ShiftInx, DL, ExtVT));
14128 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
14129 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
14131 // Use TESTD/Q, extended vector to packed dword/qword.
14132 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
14133 "Unexpected vector type.");
14134 unsigned NumElts = InVT.getVectorNumElements();
14135 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
14136 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14138 ShiftInx = InVT.getScalarSizeInBits() - 1;
14141 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
14142 DAG.getConstant(ShiftInx, DL, InVT));
14143 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
14146 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14148 MVT VT = Op.getSimpleValueType();
14149 SDValue In = Op.getOperand(0);
14150 MVT InVT = In.getSimpleValueType();
14152 if (VT == MVT::i1) {
14153 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14154 "Invalid scalar TRUNCATE operation");
14155 if (InVT.getSizeInBits() >= 32)
14157 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14158 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14160 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14161 "Invalid TRUNCATE operation");
14163 if (VT.getVectorElementType() == MVT::i1)
14164 return LowerTruncateVecI1(Op, DAG, Subtarget);
14166 // vpmovqb/w/d, vpmovdb/w, vpmovwb
14167 if (Subtarget.hasAVX512()) {
14168 // Word-to-byte truncation (vpmovwb) is only available with BWI.
14169 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
14170 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
14171 DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
14172 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14174 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14175 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
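// Viewing the v4i64 input as v8i32, the low half of each 64-bit element sits
// in an even lane on little-endian x86, so gathering lanes {0, 2, 4, 6} into
// the bottom of the vector and extracting the low 128 bits performs the
// truncation.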
14176 if (Subtarget.hasInt256()) {
14177 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14178 In = DAG.getBitcast(MVT::v8i32, In);
14179 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14181 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14182 DAG.getIntPtrConstant(0, DL));
14185 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14186 DAG.getIntPtrConstant(0, DL));
14187 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14188 DAG.getIntPtrConstant(2, DL));
14189 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
14190 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
14191 static const int ShufMask[] = {0, 2, 4, 6};
14192 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14195 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14196 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
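// PSHUFB shuffles bytes within each 128-bit lane: mask bytes 0,1,4,5,8,9,12,13
// select the low halves of the four dwords in the lane, and entries with the
// high bit set (0x80) write zeros. The packed words end up in the low 64 bits
// of each lane, and the v4i64 {0, 2} shuffle below then glues the two lanes'
// results together before the low 128 bits are extracted.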
14197 if (Subtarget.hasInt256()) {
14198 In = DAG.getBitcast(MVT::v32i8, In);
14200 SmallVector<SDValue,32> pshufbMask;
14201 for (unsigned i = 0; i < 2; ++i) {
14202 pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
14203 pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
14204 pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
14205 pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
14206 pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
14207 pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
14208 pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
14209 pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
14210 for (unsigned j = 0; j < 8; ++j)
14211 pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
14213 SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
14214 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14215 In = DAG.getBitcast(MVT::v4i64, In);
14217 static const int ShufMask[] = {0, 2, -1, -1};
14218 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
14220 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14221 DAG.getIntPtrConstant(0, DL));
14222 return DAG.getBitcast(VT, In);
14225 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14226 DAG.getIntPtrConstant(0, DL));
14228 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14229 DAG.getIntPtrConstant(4, DL));
14231 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
14232 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
14234 // The PSHUFB mask:
14235 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
14236 -1, -1, -1, -1, -1, -1, -1, -1};
14238 SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14239 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14240 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14242 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
14243 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
14245 // The MOVLHPS Mask:
14246 static const int ShufMask2[] = {0, 1, 4, 5};
14247 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14248 return DAG.getBitcast(MVT::v8i16, res);
14251 // Handle truncation of V256 to V128 using shuffles.
14252 if (!VT.is128BitVector() || !InVT.is256BitVector())
14255 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
14257 unsigned NumElems = VT.getVectorNumElements();
14258 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14260 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14261 // Prepare truncation shuffle mask
14262 for (unsigned i = 0; i != NumElems; ++i)
14263 MaskVec[i] = i * 2;
14264 SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
14265 DAG.getUNDEF(NVT), MaskVec);
14266 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14267 DAG.getIntPtrConstant(0, DL));
14270 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14271 SelectionDAG &DAG) const {
14272 assert(!Op.getSimpleValueType().isVector());
14274 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14275 /*IsSigned=*/ true, /*IsReplace=*/ false);
14276 SDValue FIST = Vals.first, StackSlot = Vals.second;
14277 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14278 if (!FIST.getNode())
14281 if (StackSlot.getNode())
14282 // Load the result.
14283 return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot,
14284 MachinePointerInfo());
14286 // The node is the result.
14290 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14291 SelectionDAG &DAG) const {
14292 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14293 /*IsSigned=*/ false, /*IsReplace=*/ false);
14294 SDValue FIST = Vals.first, StackSlot = Vals.second;
14295 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14296 if (!FIST.getNode())
14299 if (StackSlot.getNode())
14300 // Load the result.
14301 return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot,
14302 MachinePointerInfo());
14304 // The node is the result.
14308 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14310 MVT VT = Op.getSimpleValueType();
14311 SDValue In = Op.getOperand(0);
14312 MVT SVT = In.getSimpleValueType();
14314 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14316 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14317 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14318 In, DAG.getUNDEF(SVT)));
14321 /// The only differences between FABS and FNEG are the mask and the logic op.
14322 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14323 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14324 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14325 "Wrong opcode for lowering FABS or FNEG.");
14327 bool IsFABS = (Op.getOpcode() == ISD::FABS);
14329 // If this is a FABS and it has an FNEG user, bail out to fold the combination
14330 // into an FNABS. We'll lower the FABS after that if it is still in use.
14332 for (SDNode *User : Op->uses())
14333 if (User->getOpcode() == ISD::FNEG)
14337 MVT VT = Op.getSimpleValueType();
14339 bool IsF128 = (VT == MVT::f128);
14341 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14342 // decide if we should generate a 16-byte constant mask when we only need 4 or
14343 // 8 bytes for the scalar case.
14349 if (VT.isVector()) {
14351 EltVT = VT.getVectorElementType();
14352 NumElts = VT.getVectorNumElements();
14353 } else if (IsF128) {
14354 // SSE instructions are used for optimized f128 logical operations.
14355 LogicVT = MVT::f128;
14359 // There are no scalar bitwise logical SSE/AVX instructions, so we
14360 // generate a 16-byte vector constant and logic op even for the scalar case.
14361 // Using a 16-byte mask allows folding the load of the mask with
14362 // the logic op, so it can save (~4 bytes) on code size.
14363 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
14365 NumElts = (VT == MVT::f64) ? 2 : 4;
14368 unsigned EltBits = EltVT.getSizeInBits();
14369 LLVMContext *Context = DAG.getContext();
14370 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14372 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
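// For f32 elements the FABS mask is 0x7FFFFFFF (AND clears the sign bit) and
// the FNEG mask is 0x80000000 (XOR flips it); the FNABS case reuses the sign
// mask with an OR, as selected below.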
14373 Constant *C = ConstantInt::get(*Context, MaskElt);
14374 C = ConstantVector::getSplat(NumElts, C);
14375 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14376 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
14377 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14378 SDValue Mask = DAG.getLoad(
14379 LogicVT, dl, DAG.getEntryNode(), CPIdx,
14380 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
14382 SDValue Op0 = Op.getOperand(0);
14383 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14385 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14386 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14388 if (VT.isVector() || IsF128)
14389 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
14391 // For the scalar case extend to a 128-bit vector, perform the logic op,
14392 // and extract the scalar result back out.
14393 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
14394 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
14395 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
14396 DAG.getIntPtrConstant(0, dl));
14399 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14400 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14401 LLVMContext *Context = DAG.getContext();
14402 SDValue Op0 = Op.getOperand(0);
14403 SDValue Op1 = Op.getOperand(1);
14405 MVT VT = Op.getSimpleValueType();
14406 MVT SrcVT = Op1.getSimpleValueType();
14407 bool IsF128 = (VT == MVT::f128);
14409 // If second operand is smaller, extend it first.
14410 if (SrcVT.bitsLT(VT)) {
14411 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14414 // And if it is bigger, shrink it first.
14415 if (SrcVT.bitsGT(VT)) {
14416 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl));
14420 // At this point the operands and the result should have the same
14421 // type, and that won't be f80 since that is not custom lowered.
14422 assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
14423 "Unexpected type in LowerFCOPYSIGN");
14425 const fltSemantics &Sem =
14426 VT == MVT::f64 ? APFloat::IEEEdouble :
14427 (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
14428 const unsigned SizeInBits = VT.getSizeInBits();
14430 SmallVector<Constant *, 4> CV(
14431 VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
14432 ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14434 // First, clear all bits but the sign bit from the second operand (sign).
14435 CV[0] = ConstantFP::get(*Context,
14436 APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14437 Constant *C = ConstantVector::get(CV);
14438 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
14439 SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
14441 // Perform all logic operations as 16-byte vectors because there are no
14442 // scalar FP logic instructions in SSE. This allows load folding of the
14443 // constants into the logic instructions.
14444 MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
14446 DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14447 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14448 /* Alignment = */ 16);
14450 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
14451 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
14453 // Next, clear the sign bit from the first operand (magnitude).
14454 // If it's a constant, we can clear it here.
14455 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14456 APFloat APF = Op0CN->getValueAPF();
14457 // If the magnitude is a positive zero, the sign bit alone is enough.
14458 if (APF.isPosZero())
14459 return IsF128 ? SignBit :
14460 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
14461 DAG.getIntPtrConstant(0, dl));
14463 CV[0] = ConstantFP::get(*Context, APF);
14465 CV[0] = ConstantFP::get(
14467 APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14469 C = ConstantVector::get(CV);
14470 CPIdx = DAG.getConstantPool(C, PtrVT, 16);
14472 DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14473 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14474 /* Alignment = */ 16);
14475 // If the magnitude operand wasn't a constant, we need to AND out the sign.
14476 if (!isa<ConstantFPSDNode>(Op0)) {
14478 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
14479 Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
14481 // OR the magnitude value with the sign bit.
14482 Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
14483 return IsF128 ? Val :
14484 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
14485 DAG.getIntPtrConstant(0, dl));
14488 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14489 SDValue N0 = Op.getOperand(0);
14491 MVT VT = Op.getSimpleValueType();
14493 MVT OpVT = N0.getSimpleValueType();
14494 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
14495 "Unexpected type for FGETSIGN");
14497 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
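// MOVMSK packs the sign bit of every vector lane into the low bits of a GPR;
// after SCALAR_TO_VECTOR the scalar occupies lane 0, so masking the result
// with 1 extracts exactly its sign bit.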
14498 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
14499 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
14500 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
14501 Res = DAG.getZExtOrTrunc(Res, dl, VT);
14502 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
14506 // Check whether an OR'd tree is PTEST-able.
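// E.g. comparing
//   (or (or (extractelt %v, 0), (extractelt %v, 1)),
//       (or (extractelt %v, 2), (extractelt %v, 3)))
// against zero can become PTEST %v, %v once every lane of %v is known to feed
// the OR tree, since PTEST sets ZF when the AND of its operands is all zeros.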
14507 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
14508 SelectionDAG &DAG) {
14509 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14511 if (!Subtarget.hasSSE41())
14514 if (!Op->hasOneUse())
14517 SDNode *N = Op.getNode();
14520 SmallVector<SDValue, 8> Opnds;
14521 DenseMap<SDValue, unsigned> VecInMap;
14522 SmallVector<SDValue, 8> VecIns;
14523 EVT VT = MVT::Other;
14525 // Recognize a special case where a vector is cast into a wide integer to
14527 Opnds.push_back(N->getOperand(0));
14528 Opnds.push_back(N->getOperand(1));
14530 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14531 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14532 // BFS traverse all OR'd operands.
14533 if (I->getOpcode() == ISD::OR) {
14534 Opnds.push_back(I->getOperand(0));
14535 Opnds.push_back(I->getOperand(1));
14536 // Re-evaluate the number of nodes to be traversed.
14537 e += 2; // 2 more nodes (LHS and RHS) are pushed.
14541 // Quit if this is not an EXTRACT_VECTOR_ELT.
14542 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14545 // Quit if the index is not a constant.
14546 SDValue Idx = I->getOperand(1);
14547 if (!isa<ConstantSDNode>(Idx))
14550 SDValue ExtractedFromVec = I->getOperand(0);
14551 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14552 if (M == VecInMap.end()) {
14553 VT = ExtractedFromVec.getValueType();
14554 // Quit if not 128/256-bit vector.
14555 if (!VT.is128BitVector() && !VT.is256BitVector())
14557 // Quit if not the same type.
14558 if (VecInMap.begin() != VecInMap.end() &&
14559 VT != VecInMap.begin()->first.getValueType())
14561 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14562 VecIns.push_back(ExtractedFromVec);
14564 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14567 assert((VT.is128BitVector() || VT.is256BitVector()) &&
14568 "Not extracted from 128-/256-bit vector.");
14570 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14572 for (DenseMap<SDValue, unsigned>::const_iterator
14573 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
14574 // Quit if not all elements are used.
14575 if (I->second != FullMask)
14579 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
14581 // Cast all vectors into TestVT for PTEST.
14582 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
14583 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
14585 // If more than one full vector is evaluated, OR them together before the PTEST.
14586 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
14587 // Each iteration will OR 2 nodes and append the result until there is only
14588 // 1 node left, i.e. the final OR'd value of all vectors.
14589 SDValue LHS = VecIns[Slot];
14590 SDValue RHS = VecIns[Slot + 1];
14591 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
14594 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
14595 VecIns.back(), VecIns.back());
14598 /// \brief Return true if \c Op has a use that doesn't just read flags.
14599 static bool hasNonFlagsUse(SDValue Op) {
14600 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
14602 SDNode *User = *UI;
14603 unsigned UOpNo = UI.getOperandNo();
14604 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
14605 // Look past the truncate.
14606 UOpNo = User->use_begin().getOperandNo();
14607 User = *User->use_begin();
14610 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
14611 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
14617 // Emit KTEST instruction for bit vectors on AVX-512
14618 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
14619 const X86Subtarget &Subtarget) {
14620 if (Op.getOpcode() == ISD::BITCAST) {
14621 auto hasKTEST = [&](MVT VT) {
14622 unsigned SizeInBits = VT.getSizeInBits();
14623 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
14624 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
14626 SDValue Op0 = Op.getOperand(0);
14627 MVT Op0VT = Op0.getValueType().getSimpleVT();
14628 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
14630 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
14635 /// Emit nodes that will be selected as "test Op0,Op0", or something
14637 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
14638 SelectionDAG &DAG) const {
14639 if (Op.getValueType() == MVT::i1) {
14640 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
14641 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
14642 DAG.getConstant(0, dl, MVT::i8));
14644 // CF and OF aren't always set the way we want. Determine which
14645 // of these we need.
14646 bool NeedCF = false;
14647 bool NeedOF = false;
14650 case X86::COND_A: case X86::COND_AE:
14651 case X86::COND_B: case X86::COND_BE:
14654 case X86::COND_G: case X86::COND_GE:
14655 case X86::COND_L: case X86::COND_LE:
14656 case X86::COND_O: case X86::COND_NO: {
14657 // Check if we really need to set the
14658 // Overflow flag. If NoSignedWrap is present
14659 // that is not actually needed.
14660 switch (Op->getOpcode()) {
14665 const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
14666 if (BinNode->Flags.hasNoSignedWrap())
14676 // See if we can use the EFLAGS value from the operand instead of
14677 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
14678 // we prove that the arithmetic won't overflow, we can't use OF or CF.
14679 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
14680 // Emit KTEST for bit vectors
14681 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
14683 // Emit a CMP with 0, which is the TEST pattern.
14684 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14685 DAG.getConstant(0, dl, Op.getValueType()));
14687 unsigned Opcode = 0;
14688 unsigned NumOperands = 0;
14690 // Truncate operations may prevent the merge of the SETCC instruction
14691 // and the arithmetic instruction before it. Attempt to truncate the operands
14692 // of the arithmetic instruction and use a reduced bit-width instruction.
14693 bool NeedTruncation = false;
14694 SDValue ArithOp = Op;
14695 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
14696 SDValue Arith = Op->getOperand(0);
14697 // Both the trunc and the arithmetic op need to have one user each.
14698 if (Arith->hasOneUse())
14699 switch (Arith.getOpcode()) {
14706 NeedTruncation = true;
14712 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
14713 // which may be the result of a CAST. We use the variable 'Op', which is the
14714 // non-casted variable when we check for possible users.
14715 switch (ArithOp.getOpcode()) {
14717 // Due to an isel shortcoming, be conservative if this add is likely to be
14718 // selected as part of a load-modify-store instruction. When the root node
14719 // in a match is a store, isel doesn't know how to remap non-chain non-flag
14720 // uses of other nodes in the match, such as the ADD in this case. This
14721 // leads to the ADD being left around and reselected, with the result being
14722 // two adds in the output. Alas, even if none of our users are stores, that
14723 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
14724 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
14725 // climbing the DAG back to the root, and it doesn't seem to be worth the
14727 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14728 UE = Op.getNode()->use_end(); UI != UE; ++UI)
14729 if (UI->getOpcode() != ISD::CopyToReg &&
14730 UI->getOpcode() != ISD::SETCC &&
14731 UI->getOpcode() != ISD::STORE)
14734 if (ConstantSDNode *C =
14735 dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
14736 // An add of one will be selected as an INC.
14737 if (C->isOne() && !Subtarget.slowIncDec()) {
14738 Opcode = X86ISD::INC;
14743 // An add of negative one (subtract of one) will be selected as a DEC.
14744 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
14745 Opcode = X86ISD::DEC;
14751 // Otherwise use a regular EFLAGS-setting add.
14752 Opcode = X86ISD::ADD;
14757 // If we have a constant logical shift that's only used in a comparison
14758 // against zero, turn it into an equivalent AND. This allows turning it into
14759 // a TEST instruction later.
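// E.g. for i32, (srl X, 3) == 0 is equivalent to (X & 0xFFFFFFF8) == 0 and
// (shl X, 3) == 0 to (X & 0x1FFFFFFF) == 0, both of which match TEST.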
14760 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
14761 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
14762 EVT VT = Op.getValueType();
14763 unsigned BitWidth = VT.getSizeInBits();
14764 unsigned ShAmt = Op->getConstantOperandVal(1);
14765 if (ShAmt >= BitWidth) // Avoid undefined shifts.
14767 APInt Mask = ArithOp.getOpcode() == ISD::SRL
14768 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
14769 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
14770 if (!Mask.isSignedIntN(32)) // Avoid large immediates.
14772 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
14773 DAG.getConstant(Mask, dl, VT));
14778 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
14779 // because a TEST instruction will be better.
14780 if (!hasNonFlagsUse(Op)) {
14781 SDValue Op0 = ArithOp->getOperand(0);
14782 SDValue Op1 = ArithOp->getOperand(1);
14783 EVT VT = ArithOp.getValueType();
14784 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
14785 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
14787 // But if we can combine this into an ANDN operation, then create an AND
14788 // now and allow it to be pattern matched into an ANDN.
14789 if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
14796 // Due to the ISEL shortcoming noted above, be conservative if this op is
14797 // likely to be selected as part of a load-modify-store instruction.
14798 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14799 UE = Op.getNode()->use_end(); UI != UE; ++UI)
14800 if (UI->getOpcode() == ISD::STORE)
14803 // Otherwise use a regular EFLAGS-setting instruction.
14804 switch (ArithOp.getOpcode()) {
14805 default: llvm_unreachable("unexpected operator!");
14806 case ISD::SUB: Opcode = X86ISD::SUB; break;
14807 case ISD::XOR: Opcode = X86ISD::XOR; break;
14808 case ISD::AND: Opcode = X86ISD::AND; break;
14810 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
14811 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
14814 Opcode = X86ISD::OR;
14828 return SDValue(Op.getNode(), 1);
14834 // If we found that truncation is beneficial, perform the truncation and
14836 if (NeedTruncation) {
14837 EVT VT = Op.getValueType();
14838 SDValue WideVal = Op->getOperand(0);
14839 EVT WideVT = WideVal.getValueType();
14840 unsigned ConvertedOp = 0;
14841 // Use a target machine opcode to prevent further DAGCombine
14842 // optimizations that may separate the arithmetic operations
14843 // from the setcc node.
14844 switch (WideVal.getOpcode()) {
14846 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
14847 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
14848 case ISD::AND: ConvertedOp = X86ISD::AND; break;
14849 case ISD::OR: ConvertedOp = X86ISD::OR; break;
14850 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
14854 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14855 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
14856 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
14857 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
14858 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
14864 // Emit KTEST for bit vectors
14865 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
14868 // Emit a CMP with 0, which is the TEST pattern.
14869 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14870 DAG.getConstant(0, dl, Op.getValueType()));
14872 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
14873 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
14875 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
14876 DAG.ReplaceAllUsesWith(Op, New);
14877 return SDValue(New.getNode(), 1);
14880 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
14882 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
14883 const SDLoc &dl, SelectionDAG &DAG) const {
14884 if (isNullConstant(Op1))
14885 return EmitTest(Op0, X86CC, dl, DAG);
14887 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
14888 "Unexpected comparison operation for MVT::i1 operands");
14890 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
14891 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
14892 // Only promote the compare up to i32 if it is a 16-bit operation
14893 // with an immediate. 16-bit immediates are to be avoided.
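// (A 16-bit immediate forces a length-changing operand-size prefix, which can
// stall the instruction decoders, so widening the compare to i32 is usually
// cheaper.)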
14894 if ((Op0.getValueType() == MVT::i16 &&
14895 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
14896 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
14897 !Subtarget.isAtom()) {
14898 unsigned ExtendOp =
14899 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
14900 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
14901 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
14903 // Use SUB instead of CMP to enable CSE between SUB and CMP.
14904 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
14905 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
14907 return SDValue(Sub.getNode(), 1);
14909 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
14912 /// Convert a comparison if required by the subtarget.
14913 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
14914 SelectionDAG &DAG) const {
14915 // If the subtarget does not support the FUCOMI instruction, floating-point
14916 // comparisons have to be converted.
14917 if (Subtarget.hasCMov() ||
14918 Cmp.getOpcode() != X86ISD::CMP ||
14919 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
14920 !Cmp.getOperand(1).getValueType().isFloatingPoint())
14923 // The instruction selector will select an FUCOM instruction instead of
14924 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
14925 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
14926 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
14928 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
14929 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
14930 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
14931 DAG.getConstant(8, dl, MVT::i8));
14932 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
14934 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
14935 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
14936 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
14939 /// The minimum architected relative accuracy is 2^-12. We need one
14940 /// Newton-Raphson step to have a good float result (24 bits of precision).
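/// Each refinement step computes Est = Est * (1.5 - 0.5 * Op * Est * Est),
/// roughly doubling the number of correct bits; the refinement itself is
/// built by the generic DAG combiner around this estimate.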
14941 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
14942 DAGCombinerInfo &DCI,
14943 unsigned &RefinementSteps,
14944 bool &UseOneConstNR) const {
14945 EVT VT = Op.getValueType();
14946 const char *RecipOp;
14948 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
14949 // TODO: Add support for AVX512 (v16f32).
14950 // It is likely not profitable to do this for f64 because a double-precision
14951 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
14952 // instructions: convert to single, rsqrtss, convert back to double, refine
14953 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
14954 // along with FMA, this could be a throughput win.
14955 if (VT == MVT::f32 && Subtarget.hasSSE1())
14957 else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
14958 (VT == MVT::v8f32 && Subtarget.hasAVX()))
14959 RecipOp = "vec-sqrtf";
14963 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
14964 if (!Recips.isEnabled(RecipOp))
14967 RefinementSteps = Recips.getRefinementSteps(RecipOp);
14968 UseOneConstNR = false;
14969 return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
14972 /// The minimum architected relative accuracy is 2^-12. We need one
14973 /// Newton-Raphson step to have a good float result (24 bits of precision).
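/// Each refinement step computes Est = Est + Est * (1.0 - Op * Est), the
/// standard Newton-Raphson iteration for a reciprocal.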
14974 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
14975 DAGCombinerInfo &DCI,
14976 unsigned &RefinementSteps) const {
14977 EVT VT = Op.getValueType();
14978 const char *RecipOp;
14980 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
14981 // TODO: Add support for AVX512 (v16f32).
14982 // It is likely not profitable to do this for f64 because a double-precision
14983 // reciprocal estimate with refinement on x86 prior to FMA requires
14984 // 15 instructions: convert to single, rcpss, convert back to double, refine
14985 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
14986 // along with FMA, this could be a throughput win.
14987 if (VT == MVT::f32 && Subtarget.hasSSE1())
14989 else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
14990 (VT == MVT::v8f32 && Subtarget.hasAVX()))
14991 RecipOp = "vec-divf";
14995 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
14996 if (!Recips.isEnabled(RecipOp))
14999 RefinementSteps = Recips.getRefinementSteps(RecipOp);
15000 return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15003 /// If we have at least two divisions that use the same divisor, convert to
15004 /// multiplication by a reciprocal. This may need to be adjusted for a given
15005 /// CPU if a division's cost is not at least twice the cost of a multiplication.
15006 /// This is because we still need one division to calculate the reciprocal and
15007 /// then we need two multiplies by that reciprocal as replacements for the
15008 /// original divisions.
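/// E.g. A/D and B/D become R = 1.0/D; A*R; B*R: one divide plus two multiplies
/// instead of two divides, which pays off when a divide costs more than twice
/// a multiply.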
15009 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
15013 /// Result of 'and' is compared against zero. Change to a BT node if possible.
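/// E.g. (and X, (shl 1, N)) ==/!= 0 becomes (BT X, N); the tested bit lands in
/// CF and the result is read back with SETAE (for ==) or SETB (for !=).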
15014 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15015 const SDLoc &dl, SelectionDAG &DAG) const {
15016 SDValue Op0 = And.getOperand(0);
15017 SDValue Op1 = And.getOperand(1);
15018 if (Op0.getOpcode() == ISD::TRUNCATE)
15019 Op0 = Op0.getOperand(0);
15020 if (Op1.getOpcode() == ISD::TRUNCATE)
15021 Op1 = Op1.getOperand(0);
15024 if (Op1.getOpcode() == ISD::SHL)
15025 std::swap(Op0, Op1);
15026 if (Op0.getOpcode() == ISD::SHL) {
15027 if (isOneConstant(Op0.getOperand(0))) {
15028 // If we looked past a truncate, check that it's only truncating away
15030 unsigned BitWidth = Op0.getValueSizeInBits();
15031 unsigned AndBitWidth = And.getValueSizeInBits();
15032 if (BitWidth > AndBitWidth) {
15034 DAG.computeKnownBits(Op0, Zeros, Ones);
15035 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15039 RHS = Op0.getOperand(1);
15041 } else if (Op1.getOpcode() == ISD::Constant) {
15042 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15043 uint64_t AndRHSVal = AndRHS->getZExtValue();
15044 SDValue AndLHS = Op0;
15046 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15047 LHS = AndLHS.getOperand(0);
15048 RHS = AndLHS.getOperand(1);
15051 // Use BT if the immediate can't be encoded in a TEST instruction.
15052 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15054 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
15058 if (LHS.getNode()) {
15059 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
15060 // instruction. Since the shift amount is in-range-or-undefined, we know
15061 // that doing a bittest on the i32 value is ok. We extend to i32 because
15062 // the encoding for the i16 version is larger than the i32 version.
15063 // Also promote i16 to i32 for performance / code size reasons.
15064 if (LHS.getValueType() == MVT::i8 ||
15065 LHS.getValueType() == MVT::i16)
15066 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15068 // If the operand types disagree, extend the shift amount to match. Since
15069 // BT ignores high bits (like shifts) we can use anyextend.
15070 if (LHS.getValueType() != RHS.getValueType())
15071 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15073 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15074 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15075 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15076 DAG.getConstant(Cond, dl, MVT::i8), BT);
15082 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
15084 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15089 // SSE Condition code mapping:
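//   0 - EQ, 1 - LT, 2 - LE, 3 - UNORD, 4 - NEQ, 5 - NLT, 6 - NLE, 7 - ORD
// (8 is used below as a local sentinel for SETUEQ/SETONE, which need two
// compares.)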
15098 switch (SetCCOpcode) {
15099 default: llvm_unreachable("Unexpected SETCC condition");
15101 case ISD::SETEQ: SSECC = 0; break;
15103 case ISD::SETGT: Swap = true; // Fallthrough
15105 case ISD::SETOLT: SSECC = 1; break;
15107 case ISD::SETGE: Swap = true; // Fallthrough
15109 case ISD::SETOLE: SSECC = 2; break;
15110 case ISD::SETUO: SSECC = 3; break;
15112 case ISD::SETNE: SSECC = 4; break;
15113 case ISD::SETULE: Swap = true; // Fallthrough
15114 case ISD::SETUGE: SSECC = 5; break;
15115 case ISD::SETULT: Swap = true; // Fallthrough
15116 case ISD::SETUGT: SSECC = 6; break;
15117 case ISD::SETO: SSECC = 7; break;
15119 case ISD::SETONE: SSECC = 8; break;
15122 std::swap(Op0, Op1);
15127 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
15128 /// concatenate the result back.
15129 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15130 MVT VT = Op.getSimpleValueType();
15132 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15133 "Unsupported value type for operation");
15135 unsigned NumElems = VT.getVectorNumElements();
15137 SDValue CC = Op.getOperand(2);
15139 // Extract the LHS vectors
15140 SDValue LHS = Op.getOperand(0);
15141 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
15142 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
15144 // Extract the RHS vectors
15145 SDValue RHS = Op.getOperand(1);
15146 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
15147 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
15149 // Issue the operation on the smaller types and concatenate the result back
15150 MVT EltVT = VT.getVectorElementType();
15151 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15152 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15153 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15154 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15157 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
15158 SDValue Op0 = Op.getOperand(0);
15159 SDValue Op1 = Op.getOperand(1);
15160 SDValue CC = Op.getOperand(2);
15161 MVT VT = Op.getSimpleValueType();
15164 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15165 "Unexpected type for boolean compare operation");
15166 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15167 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
15168 DAG.getConstant(-1, dl, VT));
15169 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
15170 DAG.getConstant(-1, dl, VT));
15171 switch (SetCCOpcode) {
15172 default: llvm_unreachable("Unexpected SETCC condition");
15174 // (x == y) -> ~(x ^ y)
15175 return DAG.getNode(ISD::XOR, dl, VT,
15176 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
15177 DAG.getConstant(-1, dl, VT));
15179 // (x != y) -> (x ^ y)
15180 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
15183 // (x > y) -> (x & ~y)
15184 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
15187 // (x < y) -> (~x & y)
15188 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
15191 // (x <= y) -> (~x | y)
15192 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
15195 // (x >= y) -> (x | ~y)
15196 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
15200 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
15202 SDValue Op0 = Op.getOperand(0);
15203 SDValue Op1 = Op.getOperand(1);
15204 SDValue CC = Op.getOperand(2);
15205 MVT VT = Op.getSimpleValueType();
15208 assert(VT.getVectorElementType() == MVT::i1 &&
15209 "Cannot set masked compare for this operation");
15211 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15213 bool Unsigned = false;
15216 switch (SetCCOpcode) {
15217 default: llvm_unreachable("Unexpected SETCC condition");
15218 case ISD::SETNE: SSECC = 4; break;
15219 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
15220 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15221 case ISD::SETLT: Swap = true; //fall-through
15222 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
15223 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15224 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15225 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
15226 case ISD::SETULE: Unsigned = true; //fall-through
15227 case ISD::SETLE: SSECC = 2; break;
15231 std::swap(Op0, Op1);
15233 return DAG.getNode(Opc, dl, VT, Op0, Op1);
15234 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15235 return DAG.getNode(Opc, dl, VT, Op0, Op1,
15236 DAG.getConstant(SSECC, dl, MVT::i8));
15239 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15240 /// operand \p Op1. If non-trivial (for example because it's not constant)
15241 /// return an empty value.
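/// E.g. (X <u <4, 4, 4, 4>) becomes (X <=u <3, 3, 3, 3>): every constant lane
/// is decremented, and the transform is rejected if any lane is zero, since
/// that would underflow.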
15242 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
15243 SelectionDAG &DAG) {
15244 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15248 MVT VT = Op1.getSimpleValueType();
15249 MVT EVT = VT.getVectorElementType();
15250 unsigned n = VT.getVectorNumElements();
15251 SmallVector<SDValue, 8> ULTOp1;
15253 for (unsigned i = 0; i < n; ++i) {
15254 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15255 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
15258 // Avoid underflow.
15259 APInt Val = Elt->getAPIntValue();
15263 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
15266 return DAG.getBuildVector(VT, dl, ULTOp1);
15269 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
15270 SelectionDAG &DAG) {
15271 SDValue Op0 = Op.getOperand(0);
15272 SDValue Op1 = Op.getOperand(1);
15273 SDValue CC = Op.getOperand(2);
15274 MVT VT = Op.getSimpleValueType();
15275 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15276 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15281 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15282 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15286 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15287 assert(VT.getVectorNumElements() <= 16);
15288 Opc = X86ISD::CMPM;
15290 Opc = X86ISD::CMPP;
15291 // The SSE/AVX packed FP comparison nodes are defined with a
15292 // floating-point vector result that matches the operand type. This allows
15293 // them to work with an SSE1 target (integer vector types are not legal).
15294 VT = Op0.getSimpleValueType();
15297 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
15298 // emit two comparisons and a logic op to tie them together.
15299 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
15302 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15304 // LLVM predicate is SETUEQ or SETONE.
15306 unsigned CombineOpc;
15307 if (SetCCOpcode == ISD::SETUEQ) {
15310 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
15311 static_cast<unsigned>(ISD::OR);
15313 assert(SetCCOpcode == ISD::SETONE);
15316 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
15317 static_cast<unsigned>(ISD::AND);
15320 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15321 DAG.getConstant(CC0, dl, MVT::i8));
15322 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15323 DAG.getConstant(CC1, dl, MVT::i8));
15324 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15326 // Handle all other FP comparisons here.
15327 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
15328 DAG.getConstant(SSECC, dl, MVT::i8));
15331 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
15332 // result type of SETCC. The bitcast is expected to be optimized away
15333 // during combining/isel.
15334 if (Opc == X86ISD::CMPP)
15335 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
15340 MVT VTOp0 = Op0.getSimpleValueType();
15341 assert(VTOp0 == Op1.getSimpleValueType() &&
15342 "Expected operands with same type!");
15343 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
15344 "Invalid number of packed elements for source and destination!");
15346 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
15347 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
15348 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
15349 // legalizer first checks whether the first operand of the setcc has
15350 // a legal type. If so, then it promotes the return type to that same type.
15351 // Otherwise, the return type is promoted to the 'next legal type' which,
15352 // for a vector of MVT::i1 is always a 128-bit integer vector type.
15354 // We reach this code only if the following two conditions are met:
15355 // 1. Both return type and operand type have been promoted to wider types
15356 // by the type legalizer.
15357 // 2. The original operand type has been promoted to a 256-bit vector.
15359 // Note that condition 2. only applies for AVX targets.
15360 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
15361 return DAG.getZExtOrTrunc(NewOp, dl, VT);
15364 // The non-AVX512 code below works under the assumption that source and
15365 // destination types are the same.
15366 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
15367 "Value types for source and destination must be the same!");
15369 // Break 256-bit integer vector compare into smaller ones.
15370 if (VT.is256BitVector() && !Subtarget.hasInt256())
15371 return Lower256IntVSETCC(Op, DAG);
15373 // Operands are boolean (vectors of i1)
15374 MVT OpVT = Op1.getSimpleValueType();
15375 if (OpVT.getVectorElementType() == MVT::i1)
15376 return LowerBoolVSETCC_AVX512(Op, DAG);
15378 // The result is boolean, but operands are int/float
15379 if (VT.getVectorElementType() == MVT::i1) {
15380 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
15381 // but there is no compare instruction for i8 and i16 elements in KNL.
15382 // In that case, use an SSE compare instead.
15383 bool UseAVX512Inst =
15384 (OpVT.is512BitVector() ||
15385 OpVT.getVectorElementType().getSizeInBits() >= 32 ||
15386 (Subtarget.hasBWI() && Subtarget.hasVLX()));
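// In other words (a sketch of the condition above): use the native AVX-512
// k-register compare when the source is 512 bits wide, when its elements are
// at least 32 bits, or when BWI+VLX provide byte/word compares at narrower
// widths; otherwise emit an ordinary vector compare and truncate the result
// to the i1 mask type below.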
15389 return LowerIntVSETCC_AVX512(Op, DAG);
15391 return DAG.getNode(ISD::TRUNCATE, dl, VT,
15392 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15395 // Lower using XOP integer comparisons.
15396 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
15397 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
15398 // Translate compare code to XOP PCOM compare mode.
15399 unsigned CmpMode = 0;
15400 switch (SetCCOpcode) {
15401 default: llvm_unreachable("Unexpected SETCC condition");
15403 case ISD::SETLT: CmpMode = 0x00; break;
15405 case ISD::SETLE: CmpMode = 0x01; break;
15407 case ISD::SETGT: CmpMode = 0x02; break;
15409 case ISD::SETGE: CmpMode = 0x03; break;
15410 case ISD::SETEQ: CmpMode = 0x04; break;
15411 case ISD::SETNE: CmpMode = 0x05; break;
15414 // Are we comparing unsigned or signed integers?
15415 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
15416 ? X86ISD::VPCOMU : X86ISD::VPCOM;
15418 return DAG.getNode(Opc, dl, VT, Op0, Op1,
15419 DAG.getConstant(CmpMode, dl, MVT::i8));
15422 // We are handling one of the integer comparisons here. Since SSE only has
15423 // GT and EQ comparisons for integers, swapping operands and multiple
15424 // operations may be required for some comparisons.
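// Note that the switch below relies on deliberate fallthrough: for example,
// SETNE only records Invert = true and then falls into the SETEQ case to
// select X86ISD::PCMPEQ.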
15426 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15427 bool Subus = false;
15429 switch (SetCCOpcode) {
15430 default: llvm_unreachable("Unexpected SETCC condition");
15431 case ISD::SETNE: Invert = true;
15432 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
15433 case ISD::SETLT: Swap = true;
15434 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
15435 case ISD::SETGE: Swap = true;
15436 case ISD::SETLE: Opc = X86ISD::PCMPGT;
15437 Invert = true; break;
15438 case ISD::SETULT: Swap = true;
15439 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15440 FlipSigns = true; break;
15441 case ISD::SETUGE: Swap = true;
15442 case ISD::SETULE: Opc = X86ISD::PCMPGT;
15443 FlipSigns = true; Invert = true; break;
15446 // Special case: Use min/max operations for SETULE/SETUGE
15447 MVT VET = VT.getVectorElementType();
15449 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15450 || (Subtarget.hasSSE2() && (VET == MVT::i8));
15453 switch (SetCCOpcode) {
15455 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
15456 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
15459 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15462 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15463 if (!MinMax && hasSubus) {
15464 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15466 // t = psubus Op0, Op1
15467 // pcmpeq t, <0..0>
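// The idea: with unsigned saturating subtraction, (Op0 - Op1) saturates to
// zero exactly when Op0 u<= Op1, so comparing the PSUBUS result against zero
// implements the unsigned less-or-equal test without a sign flip.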
15468 switch (SetCCOpcode) {
15470 case ISD::SETULT: {
15471 // If the comparison is against a constant, we can turn this into a
15472 // setule. With psubus, setule does not require a swap. This is
15473 // beneficial because the constant in the register is no longer
15474 // clobbered as the destination operand, so it can be hoisted out of a loop.
15475 // Only do this pre-AVX, since with AVX the vpcmp* forms are no longer destructive.
15476 if (Subtarget.hasAVX())
15478 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
15480 Subus = true; Invert = false; Swap = false;
15484 // Psubus is better than flip-sign because it requires no inversion.
15485 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
15486 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15490 Opc = X86ISD::SUBUS;
15496 std::swap(Op0, Op1);
15498 // Check that the operation in question is available (most are plain SSE2,
15499 // but PCMPGTQ and PCMPEQQ have different requirements).
15500 if (VT == MVT::v2i64) {
15501 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
15502 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
15504 // First cast everything to the right type.
15505 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
15506 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
15508 // Since SSE has no unsigned integer comparisons, we need to flip the sign
15509 // bits of the inputs before performing those operations. The lower
15510 // compare is always unsigned.
15513 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
15515 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
15516 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
15517 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
15519 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15520 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15522 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
15523 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15524 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15526 // Create masks for only the low parts/high parts of the 64-bit integers.
15527 static const int MaskHi[] = { 1, 1, 3, 3 };
15528 static const int MaskLo[] = { 0, 0, 2, 2 };
15529 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15530 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15531 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15533 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15534 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
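// At this point each 64-bit lane of Result holds
// (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)), with both dwords of the lane
// carrying the same value thanks to the splat shuffles above.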
15537 Result = DAG.getNOT(dl, Result, MVT::v4i32);
15539 return DAG.getBitcast(VT, Result);
15542 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
15543 // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
15544 // pcmpeqd + pshufd + pand.
15545 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
15547 // First cast everything to the right type.
15548 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
15549 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
15552 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15554 // Make sure the lower and upper halves are both all-ones.
15555 static const int Mask[] = { 1, 0, 3, 2 };
15556 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15557 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15560 Result = DAG.getNOT(dl, Result, MVT::v4i32);
15562 return DAG.getBitcast(VT, Result);
15566 // Since SSE has no unsigned integer comparisons, we need to flip the sign
15567 // bits of the inputs before performing those operations.
15569 MVT EltVT = VT.getVectorElementType();
15570 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
15572 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15573 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15576 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15578 // If the logical-not of the result is required, perform that now.
15580 Result = DAG.getNOT(dl, Result, VT);
15583 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15586 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15587 getZeroVector(VT, Subtarget, DAG, dl));
15592 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15594 MVT VT = Op.getSimpleValueType();
15596 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15598 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15599 && "SetCC type must be 8-bit or 1-bit integer");
15600 SDValue Op0 = Op.getOperand(0);
15601 SDValue Op1 = Op.getOperand(1);
15603 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15605 // Optimize to BT if possible.
15606 // Lower (X & (1 << N)) == 0 to BT(X, N).
15607 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15608 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
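// BT places the selected bit in CF, so each of these patterns collapses to a
// single bit-test instruction plus a CF-based setcc.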
15609 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15610 isNullConstant(Op1) &&
15611 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15612 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
15613 if (VT == MVT::i1) {
15614 NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC,
15615 DAG.getValueType(MVT::i1));
15616 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15622 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of these.
15624 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
15625 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15627 // If the input is a setcc, then reuse the input setcc or use a new one with
15628 // the inverted condition.
15629 if (Op0.getOpcode() == X86ISD::SETCC) {
15630 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15631 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
15635 CCode = X86::GetOppositeBranchCondition(CCode);
15636 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15637 DAG.getConstant(CCode, dl, MVT::i8),
15638 Op0.getOperand(1));
15639 if (VT == MVT::i1) {
15640 SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
15641 DAG.getValueType(MVT::i1));
15642 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15647 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15648 if (isOneConstant(Op1)) {
15649 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15650 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
15652 if (!isNullConstant(Op1)) {
15653 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
15654 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
15658 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15659 unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG);
15660 if (X86CC == X86::COND_INVALID)
15663 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15664 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15665 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15666 DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
15667 if (VT == MVT::i1) {
15668 SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
15669 DAG.getValueType(MVT::i1));
15670 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15675 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
15676 SDValue LHS = Op.getOperand(0);
15677 SDValue RHS = Op.getOperand(1);
15678 SDValue Carry = Op.getOperand(2);
15679 SDValue Cond = Op.getOperand(3);
15682 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
15683 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
15685 assert(Carry.getOpcode() != ISD::CARRY_FALSE);
15686 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15687 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
15688 SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
15689 DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
15690 if (Op.getSimpleValueType() == MVT::i1) {
15691 SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
15692 DAG.getValueType(MVT::i1));
15693 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
15698 /// Return true if opcode is a X86 logical comparison.
15699 static bool isX86LogicalCmp(SDValue Op) {
15700 unsigned Opc = Op.getNode()->getOpcode();
15701 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15702 Opc == X86ISD::SAHF)
15704 if (Op.getResNo() == 1 &&
15705 (Opc == X86ISD::ADD ||
15706 Opc == X86ISD::SUB ||
15707 Opc == X86ISD::ADC ||
15708 Opc == X86ISD::SBB ||
15709 Opc == X86ISD::SMUL ||
15710 Opc == X86ISD::UMUL ||
15711 Opc == X86ISD::INC ||
15712 Opc == X86ISD::DEC ||
15713 Opc == X86ISD::OR ||
15714 Opc == X86ISD::XOR ||
15715 Opc == X86ISD::AND))
15718 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15724 /// Return the "condition" node, which may be wrapped with a "truncate",
15725 /// like this: (i1 (trunc (i8 X86ISD::SETCC))).
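/// The helper looks through the truncate when its input either carries an
/// AssertZext of the truncated width or has its high bits known to be zero,
/// since in both cases the narrow and wide values encode the same condition.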
15726 static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15727 if (V.getOpcode() != ISD::TRUNCATE)
15730 SDValue VOp0 = V.getOperand(0);
15731 if (VOp0.getOpcode() == ISD::AssertZext &&
15732 V.getValueSizeInBits() ==
15733 cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits())
15734 return VOp0.getOperand(0);
15736 unsigned InBits = VOp0.getValueSizeInBits();
15737 unsigned Bits = V.getValueSizeInBits();
15738 if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)))
15739 return V.getOperand(0);
15743 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15744 bool addTest = true;
15745 SDValue Cond = Op.getOperand(0);
15746 SDValue Op1 = Op.getOperand(1);
15747 SDValue Op2 = Op.getOperand(2);
15749 MVT VT = Op1.getSimpleValueType();
15752 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15753 // are available or VBLENDV if AVX is available.
15754 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
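// A sketch of the scalar sequence built below:
//   mask   = cmpss/cmpsd CondOp0, CondOp1, cc   (all-ones or all-zeros)
//   result = (mask & Op1) | (~mask & Op2)
// or a single VBLENDV when AVX makes that profitable.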
15755 if (Cond.getOpcode() == ISD::SETCC &&
15756 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15757 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
15758 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
15759 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15760 int SSECC = translateX86FSETCC(
15761 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15764 if (Subtarget.hasAVX512()) {
15765 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15766 DAG.getConstant(SSECC, DL, MVT::i8));
15767 return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15770 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15771 DAG.getConstant(SSECC, DL, MVT::i8));
15773 // If we have AVX, we can use a variable vector select (VBLENDV) instead
15774 // of 3 logic instructions for size savings and potentially speed.
15775 // Unfortunately, there is no scalar form of VBLENDV.
15777 // If either operand is a constant, don't try this. We can expect to
15778 // optimize away at least one of the logic instructions later in that
15779 // case, so that sequence would be faster than a variable blend.
15781 // BLENDV was introduced with SSE 4.1, but the two-register form implicitly
15782 // uses XMM0 as the selection register. That may need just as many
15783 // instructions as the AND/ANDN/OR sequence due to register moves, so
15786 if (Subtarget.hasAVX() &&
15787 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
15789 // Convert to vectors, do a VSELECT, and convert back to scalar.
15790 // All of the conversions should be optimized away.
15792 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
15793 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
15794 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
15795 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
15797 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
15798 VCmp = DAG.getBitcast(VCmpVT, VCmp);
15800 SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
15802 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
15803 VSel, DAG.getIntPtrConstant(0, DL));
15805 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
15806 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
15807 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
15811 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
15813 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
15814 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
15815 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
15816 Op1Scalar = Op1.getOperand(0);
15818 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
15819 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
15820 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
15821 Op2Scalar = Op2.getOperand(0);
15822 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
15823 SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
15824 Op1Scalar.getValueType(),
15825 Cond, Op1Scalar, Op2Scalar);
15826 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
15827 return DAG.getBitcast(VT, newSelect);
15828 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
15829 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
15830 DAG.getIntPtrConstant(0, DL));
15834 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
15835 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
15836 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
15837 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
15838 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
15839 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
15840 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
15842 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
15845 if (Cond.getOpcode() == ISD::SETCC) {
15846 if (SDValue NewCond = LowerSETCC(Cond, DAG))
15850 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
15851 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
15852 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
15853 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
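// These all hinge on the borrow from (x - 1): when x == 0 the subtraction
// sets the carry flag, and SETCC_CARRY (sbb reg, reg) then materializes
// 0 or -1 from CF without a branch.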
15854 if (Cond.getOpcode() == X86ISD::SETCC &&
15855 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
15856 isNullConstant(Cond.getOperand(1).getOperand(1))) {
15857 SDValue Cmp = Cond.getOperand(1);
15859 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
15861 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
15862 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
15863 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
15865 SDValue CmpOp0 = Cmp.getOperand(0);
15866 // Apply further optimizations for special cases
15867 // (select (x != 0), -1, 0) -> neg & sbb
15868 // (select (x == 0), 0, -1) -> neg & sbb
15869 if (isNullConstant(Y) &&
15870 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
15871 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
15872 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
15873 DAG.getConstant(0, DL,
15874 CmpOp0.getValueType()),
15876 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15877 DAG.getConstant(X86::COND_B, DL, MVT::i8),
15878 SDValue(Neg.getNode(), 1));
15882 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
15883 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
15884 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15886 SDValue Res = // Res = 0 or -1.
15887 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15888 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
15890 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
15891 Res = DAG.getNOT(DL, Res, Res.getValueType());
15893 if (!isNullConstant(Op2))
15894 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
15899 // Look past (and (setcc_carry (cmp ...)), 1).
15900 if (Cond.getOpcode() == ISD::AND &&
15901 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
15902 isOneConstant(Cond.getOperand(1)))
15903 Cond = Cond.getOperand(0);
15905 // If the condition flag is set by an X86ISD::CMP, then use it as the condition
15906 // setting operand in place of the X86ISD::SETCC.
15907 unsigned CondOpcode = Cond.getOpcode();
15908 if (CondOpcode == X86ISD::SETCC ||
15909 CondOpcode == X86ISD::SETCC_CARRY) {
15910 CC = Cond.getOperand(0);
15912 SDValue Cmp = Cond.getOperand(1);
15913 unsigned Opc = Cmp.getOpcode();
15914 MVT VT = Op.getSimpleValueType();
15916 bool IllegalFPCMov = false;
15917 if (VT.isFloatingPoint() && !VT.isVector() &&
15918 !isScalarFPTypeInSSEReg(VT)) // FPStack?
15919 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
15921 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
15922 Opc == X86ISD::BT) { // FIXME
15926 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
15927 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
15928 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
15929 Cond.getOperand(0).getValueType() != MVT::i8)) {
15930 SDValue LHS = Cond.getOperand(0);
15931 SDValue RHS = Cond.getOperand(1);
15932 unsigned X86Opcode;
15935 switch (CondOpcode) {
15936 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
15937 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
15938 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
15939 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
15940 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
15941 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
15942 default: llvm_unreachable("unexpected overflowing operator");
15944 if (CondOpcode == ISD::UMULO)
15945 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
15948 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15950 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
15952 if (CondOpcode == ISD::UMULO)
15953 Cond = X86Op.getValue(2);
15955 Cond = X86Op.getValue(1);
15957 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
15962 // Look past the truncate if the high bits are known zero.
15963 Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
15965 // We know the result of AND is compared against zero. Try to match it to BT.
15967 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
15968 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
15969 CC = NewSetCC.getOperand(0);
15970 Cond = NewSetCC.getOperand(1);
15977 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
15978 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
15981 // a < b ? -1 : 0 -> RES = ~setcc_carry
15982 // a < b ? 0 : -1 -> RES = setcc_carry
15983 // a >= b ? -1 : 0 -> RES = setcc_carry
15984 // a >= b ? 0 : -1 -> RES = ~setcc_carry
15985 if (Cond.getOpcode() == X86ISD::SUB) {
15986 Cond = ConvertCmpIfNecessary(Cond, DAG);
15987 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
15989 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
15990 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
15991 (isNullConstant(Op1) || isNullConstant(Op2))) {
15992 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15993 DAG.getConstant(X86::COND_B, DL, MVT::i8),
15995 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
15996 return DAG.getNOT(DL, Res, Res.getValueType());
16001 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
16002 // widen the cmov and push the truncate through. This avoids introducing a new
16003 // branch during isel and doesn't add any extensions.
16004 if (Op.getValueType() == MVT::i8 &&
16005 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
16006 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
16007 if (T1.getValueType() == T2.getValueType() &&
16008 // Blacklist CopyFromReg to avoid partial register stalls.
16009 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
16010 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
16011 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
16012 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
16016 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
16017 // the condition is true.
16018 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
16019 SDValue Ops[] = { Op2, Op1, CC, Cond };
16020 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
16023 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
16024 const X86Subtarget &Subtarget,
16025 SelectionDAG &DAG) {
16026 MVT VT = Op->getSimpleValueType(0);
16027 SDValue In = Op->getOperand(0);
16028 MVT InVT = In.getSimpleValueType();
16029 MVT VTElt = VT.getVectorElementType();
16030 MVT InVTElt = InVT.getVectorElementType();
16034 if ((InVTElt == MVT::i1) &&
16035 (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
16036 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
16038 ((Subtarget.hasBWI() && VT.is512BitVector() &&
16039 VTElt.getSizeInBits() <= 16)) ||
16041 ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
16042 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
16044 ((Subtarget.hasDQI() && VT.is512BitVector() &&
16045 VTElt.getSizeInBits() >= 32))))
16046 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16048 unsigned int NumElts = VT.getVectorNumElements();
16050 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
16053 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16054 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16055 return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16056 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16059 assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16060 MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
16062 DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
16065 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
16067 SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
16068 if (VT.is512BitVector())
16070 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
16073 static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
16074 const X86Subtarget &Subtarget,
16075 SelectionDAG &DAG) {
16076 SDValue In = Op->getOperand(0);
16077 MVT VT = Op->getSimpleValueType(0);
16078 MVT InVT = In.getSimpleValueType();
16079 assert(VT.getSizeInBits() == InVT.getSizeInBits());
16081 MVT SVT = VT.getVectorElementType();
16082 MVT InSVT = InVT.getVectorElementType();
16083 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
16085 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
16087 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
16089 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
16090 !(VT.is256BitVector() && Subtarget.hasInt256()))
16095 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
16096 if (VT.is256BitVector())
16097 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
16098 MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
16099 In, DAG.getIntPtrConstant(0, dl));
16101 // SSE41 targets can use the pmovsx* instructions directly.
16102 if (Subtarget.hasSSE41())
16103 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16105 // Pre-SSE41 targets unpack the lower lanes and then sign-extend using SRAI.
16109 // As SRAI is only available on i16/i32 types, we expand only up to i32
16110 // and handle i64 separately.
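// The loop below interleaves the source with undef (UNPCKL), which moves each
// element into the high half of a twice-as-wide element; the VSRAI after the
// loop then shifts by the accumulated width difference to replicate the sign
// bit (e.g. v16i8 -> v8i16 via punpcklbw followed by psraw 8).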
16111 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
16112 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
16113 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
16114 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
16115 Curr = DAG.getBitcast(CurrVT, Curr);
16118 SDValue SignExt = Curr;
16119 if (CurrVT != InVT) {
16120 unsigned SignExtShift =
16121 CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits();
16122 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
16123 DAG.getConstant(SignExtShift, dl, MVT::i8));
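// For a v2i64 result there is no 64-bit arithmetic shift available here, so
// the upper halves are produced separately as (Curr >>a 31) and interleaved
// with the extended low halves by the {0, 4, 1, 5} shuffle below.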
16129 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
16130 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
16131 DAG.getConstant(31, dl, MVT::i8));
16132 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
16133 return DAG.getBitcast(VT, Ext);
16139 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16140 SelectionDAG &DAG) {
16141 MVT VT = Op->getSimpleValueType(0);
16142 SDValue In = Op->getOperand(0);
16143 MVT InVT = In.getSimpleValueType();
16146 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16147 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16149 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16150 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16151 (VT != MVT::v16i16 || InVT != MVT::v16i8))
16154 if (Subtarget.hasInt256())
16155 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16157 // Optimize vectors in AVX mode.
16158 // Sign extend the input (e.g. v8i16 to v8i32) as follows:
16161 // divide the input vector into two parts
16162 // (for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }),
16163 // use a vpmovsx instruction to extend each half (v4i32 -> v2i64; v8i16 -> v4i32),
16164 // then concat the extended halves back to the original VT.
16166 unsigned NumElems = InVT.getVectorNumElements();
16167 SDValue Undef = DAG.getUNDEF(InVT);
16169 SmallVector<int,8> ShufMask1(NumElems, -1);
16170 for (unsigned i = 0; i != NumElems/2; ++i)
16173 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
16175 SmallVector<int,8> ShufMask2(NumElems, -1);
16176 for (unsigned i = 0; i != NumElems/2; ++i)
16177 ShufMask2[i] = i + NumElems/2;
16179 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
16181 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
16182 VT.getVectorNumElements()/2);
16184 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16185 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16187 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16190 // Lower a truncating store. We need special lowering for truncating stores to vXi1 vectors.
16191 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
16192 SelectionDAG &DAG) {
16193 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
16195 EVT MemVT = St->getMemoryVT();
16196 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
16197 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
16198 "Expected truncstore of i1 vector");
16200 SDValue Op = St->getValue();
16201 MVT OpVT = Op.getValueType().getSimpleVT();
16202 unsigned NumElts = OpVT.getVectorNumElements();
16203 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
16205 // Truncate and store - everything is legal
16206 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
16207 if (MemVT.getSizeInBits() < 8)
16208 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
16209 DAG.getUNDEF(MVT::v8i1), Op,
16210 DAG.getIntPtrConstant(0, dl));
16211 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
16212 St->getMemOperand());
16215 // Only a subset of the AVX-512 features is available; assume we have just AVX-512F.
16216 if (NumElts <= 8) {
16218 // Extend to an 8-element vector.
16219 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
16220 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
16221 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
16223 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
16224 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
16225 St->getMemOperand());
16228 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
16229 // Divide the vector into 2 parts and store each part separately
16230 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
16231 DAG.getIntPtrConstant(0, dl));
16232 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
16233 SDValue BasePtr = St->getBasePtr();
16234 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
16235 St->getMemOperand());
16236 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
16237 DAG.getIntPtrConstant(16, dl));
16238 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
16240 SDValue BasePtrHi =
16241 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
16242 DAG.getConstant(2, dl, BasePtr.getValueType()));
16244 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
16245 BasePtrHi, St->getMemOperand());
16246 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
16249 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
16250 const X86Subtarget &Subtarget,
16251 SelectionDAG &DAG) {
16253 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16255 EVT MemVT = Ld->getMemoryVT();
16256 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
16257 "Expected i1 vector load");
16258 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
16259 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16260 MVT VT = Op.getValueType().getSimpleVT();
16261 unsigned NumElts = VT.getVectorNumElements();
16263 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
16265 // Load and extend - everything is legal
16267 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
16269 Ld->getMemOperand());
16270 // Replace chain users with the new chain.
16271 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16272 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16273 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
16274 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
16276 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
16277 DAG.getIntPtrConstant(0, dl));
16279 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
16281 Ld->getMemOperand());
16282 // Replace chain users with the new chain.
16283 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16284 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16286 // Finally, do a normal sign-extend to the desired register.
16287 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
16290 if (NumElts <= 8) {
16291 // Only a subset of the AVX-512 features is available; assume we have just AVX-512F.
16292 unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
16293 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
16294 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
16296 Ld->getMemOperand());
16297 // Replace chain users with the new chain.
16298 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16299 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16301 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
16302 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
16305 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
16307 // We still need to handle the v4i1 and v2i1 cases.
16309 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
16310 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
16311 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
16312 DAG.getIntPtrConstant(0, dl));
16315 assert(VT == MVT::v32i8 && "Unexpected extload type");
16317 SmallVector<SDValue, 2> Chains;
16319 SDValue BasePtr = Ld->getBasePtr();
16320 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
16322 Ld->getMemOperand());
16323 Chains.push_back(LoadLo.getValue(1));
16325 SDValue BasePtrHi =
16326 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
16327 DAG.getConstant(2, dl, BasePtr.getValueType()));
16329 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
16331 Ld->getMemOperand());
16332 Chains.push_back(LoadHi.getValue(1));
16333 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16334 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
16336 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
16337 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
16338 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
16341 // Lower vector extended loads using a shuffle. If SSSE3 is not available, we
16342 // may emit an illegal shuffle but the expansion is still better than scalar
16343 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16344 // we'll emit a shuffle and an arithmetic shift.
16345 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
16346 // TODO: It is possible to support ZExt by zeroing the undef values during
16347 // the shuffle phase or after the shuffle.
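// In outline: reload the memory value using the widest legal scalar loads,
// assemble those scalars into a vector, bitcast it to a vector of MemVT's
// element type, and then either sign-extend it (VSEXT or
// SIGN_EXTEND_VECTOR_INREG) for sextloads or spread the elements into place
// with a widening shuffle for extloads.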
16348 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
16349 SelectionDAG &DAG) {
16350 MVT RegVT = Op.getSimpleValueType();
16351 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16352 assert(RegVT.isInteger() &&
16353 "We only custom lower integer vector sext loads.");
16355 // Nothing useful we can do without SSE2 shuffles.
16356 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
16358 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16360 EVT MemVT = Ld->getMemoryVT();
16361 if (MemVT.getScalarType() == MVT::i1)
16362 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
16364 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16365 unsigned RegSz = RegVT.getSizeInBits();
16367 ISD::LoadExtType Ext = Ld->getExtensionType();
16369 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16370 && "Only anyext and sext are currently implemented.");
16371 assert(MemVT != RegVT && "Cannot extend to the same type");
16372 assert(MemVT.isVector() && "Must load a vector from memory");
16374 unsigned NumElems = RegVT.getVectorNumElements();
16375 unsigned MemSz = MemVT.getSizeInBits();
16376 assert(RegSz > MemSz && "Register size must be greater than the mem size");
16378 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
16379 // The only way in which we have a legal 256-bit vector result but not the
16380 // integer 256-bit operations needed to directly lower a sextload is if we
16381 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16382 // a 128-bit vector and a normal sign_extend to 256-bits that should get
16383 // correctly legalized. We do this late to allow the canonical form of
16384 // sextload to persist throughout the rest of the DAG combiner -- it wants
16385 // to fold together any extensions it can, and so will fuse a sign_extend
16386 // of an sextload into a sextload targeting a wider value.
16388 if (MemSz == 128) {
16389 // Just switch this to a normal load.
16390 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16391 "it must be a legal 128-bit vector "
16393 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16394 Ld->getPointerInfo(), Ld->getAlignment(),
16395 Ld->getMemOperand()->getFlags());
16397 assert(MemSz < 128 &&
16398 "Can't extend a type wider than 128 bits to a 256 bit vector!");
16399 // Do an sext load to a 128-bit vector type. We want to use the same
16400 // number of elements, but elements half as wide. This will end up being
16401 // recursively lowered by this routine, but will succeed as we definitely
16402 // have all the necessary features if we're using AVX1.
16404 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16405 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16407 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16408 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
16409 Ld->getMemOperand()->getFlags());
16412 // Replace chain users with the new chain.
16413 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16414 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16416 // Finally, do a normal sign-extend to the desired register.
16417 return DAG.getSExtOrTrunc(Load, dl, RegVT);
16420 // All sizes must be a power of two.
16421 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16422 "Non-power-of-two elements are not custom lowered!");
16424 // Attempt to load the original value using scalar loads.
16425 // Find the largest scalar type that divides the total loaded size.
16426 MVT SclrLoadTy = MVT::i8;
16427 for (MVT Tp : MVT::integer_valuetypes()) {
16428 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16433 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
16434 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16436 SclrLoadTy = MVT::f64;
16438 // Calculate the number of scalar loads that we need to perform
16439 // in order to load our vector from memory.
16440 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16442 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16443 "Can only lower sext loads with a single scalar load!");
16445 unsigned loadRegZize = RegSz;
16446 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
16449 // Represent our vector as a sequence of elements which are the
16450 // largest scalar that we can load.
16451 EVT LoadUnitVecVT = EVT::getVectorVT(
16452 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16454 // Represent the data using the same element type that is stored in
16455 // memory. In practice, we "widen" MemVT.
16457 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16458 loadRegZize / MemVT.getScalarSizeInBits());
16460 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16461 "Invalid vector type");
16463 // We can't shuffle using an illegal type.
16464 assert(TLI.isTypeLegal(WideVecVT) &&
16465 "We only lower types that form legal widened vector types");
16467 SmallVector<SDValue, 8> Chains;
16468 SDValue Ptr = Ld->getBasePtr();
16469 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
16470 TLI.getPointerTy(DAG.getDataLayout()));
16471 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16473 for (unsigned i = 0; i < NumLoads; ++i) {
16474 // Perform a single load.
16475 SDValue ScalarLoad =
16476 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16477 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
16478 Chains.push_back(ScalarLoad.getValue(1));
16479 // Create the vector from the first element using SCALAR_TO_VECTOR in order to avoid
16480 // another round of DAGCombining.
16482 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16484 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16485 ScalarLoad, DAG.getIntPtrConstant(i, dl));
16487 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16490 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16492 // Bitcast the loaded value to a vector of the original element type, in
16493 // the size of the target vector type.
16494 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
16495 unsigned SizeRatio = RegSz / MemSz;
16497 if (Ext == ISD::SEXTLOAD) {
16498 // If we have SSE4.1, we can directly emit a VSEXT node.
16499 if (Subtarget.hasSSE41()) {
16500 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16501 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16505 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest lanes.
16507 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
16508 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
16510 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
16511 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16515 // Redistribute the loaded elements into the different locations.
16516 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16517 for (unsigned i = 0; i != NumElems; ++i)
16518 ShuffleVec[i * SizeRatio] = i;
16520 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16521 DAG.getUNDEF(WideVecVT), ShuffleVec);
16523 // Bitcast to the requested type.
16524 Shuff = DAG.getBitcast(RegVT, Shuff);
16525 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16529 /// Return true if the node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes,
16530 /// each of which has no other use apart from the AND / OR.
16531 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16532 Opc = Op.getOpcode();
16533 if (Opc != ISD::OR && Opc != ISD::AND)
16535 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16536 Op.getOperand(0).hasOneUse() &&
16537 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16538 Op.getOperand(1).hasOneUse());
16541 /// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and the
16542 /// SETCC node has a single use.
16543 static bool isXor1OfSetCC(SDValue Op) {
16544 if (Op.getOpcode() != ISD::XOR)
16546 if (isOneConstant(Op.getOperand(1)))
16547 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16548 Op.getOperand(0).hasOneUse();
16552 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16553 bool addTest = true;
16554 SDValue Chain = Op.getOperand(0);
16555 SDValue Cond = Op.getOperand(1);
16556 SDValue Dest = Op.getOperand(2);
16559 bool Inverted = false;
16561 if (Cond.getOpcode() == ISD::SETCC) {
16562 // Check for setcc([su]{add,sub,mul}o == 0).
16563 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16564 isNullConstant(Cond.getOperand(1)) &&
16565 Cond.getOperand(0).getResNo() == 1 &&
16566 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16567 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16568 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16569 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16570 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16571 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16573 Cond = Cond.getOperand(0);
16575 if (SDValue NewCond = LowerSETCC(Cond, DAG))
16580 // FIXME: LowerXALUO doesn't handle these!!
16581 else if (Cond.getOpcode() == X86ISD::ADD ||
16582 Cond.getOpcode() == X86ISD::SUB ||
16583 Cond.getOpcode() == X86ISD::SMUL ||
16584 Cond.getOpcode() == X86ISD::UMUL)
16585 Cond = LowerXALUO(Cond, DAG);
16588 // Look past (and (setcc_carry (cmp ...)), 1).
16589 if (Cond.getOpcode() == ISD::AND &&
16590 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
16591 isOneConstant(Cond.getOperand(1)))
16592 Cond = Cond.getOperand(0);
16594 // If the condition flag is set by an X86ISD::CMP, then use it as the condition
16595 // setting operand in place of the X86ISD::SETCC.
16596 unsigned CondOpcode = Cond.getOpcode();
16597 if (CondOpcode == X86ISD::SETCC ||
16598 CondOpcode == X86ISD::SETCC_CARRY) {
16599 CC = Cond.getOperand(0);
16601 SDValue Cmp = Cond.getOperand(1);
16602 unsigned Opc = Cmp.getOpcode();
16603 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16604 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16608 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16612 // These can only come from an arithmetic instruction with overflow,
16613 // e.g. SADDO, UADDO.
16614 Cond = Cond.getNode()->getOperand(1);
16620 CondOpcode = Cond.getOpcode();
16621 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16622 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16623 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16624 Cond.getOperand(0).getValueType() != MVT::i8)) {
16625 SDValue LHS = Cond.getOperand(0);
16626 SDValue RHS = Cond.getOperand(1);
16627 unsigned X86Opcode;
16630 // Keep this in sync with LowerXALUO, otherwise we might create redundant
16631 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and X86ISD::INC).
16633 switch (CondOpcode) {
16634 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16636 if (isOneConstant(RHS)) {
16637 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16640 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16641 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16643 if (isOneConstant(RHS)) {
16644 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16647 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16648 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16649 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16650 default: llvm_unreachable("unexpected overflowing operator");
16653 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16654 if (CondOpcode == ISD::UMULO)
16655 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16658 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16660 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16662 if (CondOpcode == ISD::UMULO)
16663 Cond = X86Op.getValue(2);
16665 Cond = X86Op.getValue(1);
16667 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
16671 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16672 SDValue Cmp = Cond.getOperand(0).getOperand(1);
16673 if (CondOpc == ISD::OR) {
16674 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16675 // two branches instead of an explicit OR instruction with a separate test.
16677 if (Cmp == Cond.getOperand(1).getOperand(1) &&
16678 isX86LogicalCmp(Cmp)) {
16679 CC = Cond.getOperand(0).getOperand(0);
16680 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16681 Chain, Dest, CC, Cmp);
16682 CC = Cond.getOperand(1).getOperand(0);
16686 } else { // ISD::AND
16687 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16688 // two branches instead of an explicit AND instruction with a
16689 // separate test. However, we only do this if this block doesn't
16690 // have a fall-through edge, because this requires an explicit
16691 // jmp when the condition is false.
16692 if (Cmp == Cond.getOperand(1).getOperand(1) &&
16693 isX86LogicalCmp(Cmp) &&
16694 Op.getNode()->hasOneUse()) {
16695 X86::CondCode CCode =
16696 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16697 CCode = X86::GetOppositeBranchCondition(CCode);
16698 CC = DAG.getConstant(CCode, dl, MVT::i8);
16699 SDNode *User = *Op.getNode()->use_begin();
16700 // Look for an unconditional branch following this conditional branch.
16701 // We need this because we need to reverse the successors in order
16702 // to implement FCMP_OEQ.
16703 if (User->getOpcode() == ISD::BR) {
16704 SDValue FalseBB = User->getOperand(1);
16706 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16707 assert(NewBR == User);
16711 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16712 Chain, Dest, CC, Cmp);
16713 X86::CondCode CCode =
16714 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16715 CCode = X86::GetOppositeBranchCondition(CCode);
16716 CC = DAG.getConstant(CCode, dl, MVT::i8);
16722 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16723 // Recognize (xor (setcc), 1) patterns; the xor inverts the condition.
16724 // This should be transformed by the DAG combiner, except when the condition
16725 // is set by an arithmetic-with-overflow node.
16726 X86::CondCode CCode =
16727 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16728 CCode = X86::GetOppositeBranchCondition(CCode);
16729 CC = DAG.getConstant(CCode, dl, MVT::i8);
16730 Cond = Cond.getOperand(0).getOperand(1);
16732 } else if (Cond.getOpcode() == ISD::SETCC &&
16733 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16734 // For FCMP_OEQ, we can emit
16735 // two branches instead of an explicit AND instruction with a
16736 // separate test. However, we only do this if this block doesn't
16737 // have a fall-through edge, because this requires an explicit
16738 // jmp when the condition is false.
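// A sketch of the trick: with ucomiss/ucomisd, a == b (ordered) holds iff ZF
// is set and PF is clear, so the successors are swapped and we branch to the
// false block on NE and again on P, leaving the fallthrough/unconditional
// branch for the true block.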
16739 if (Op.getNode()->hasOneUse()) {
16740 SDNode *User = *Op.getNode()->use_begin();
16741 // Look for an unconditional branch following this conditional branch.
16742 // We need this because we need to reverse the successors in order
16743 // to implement FCMP_OEQ.
16744 if (User->getOpcode() == ISD::BR) {
16745 SDValue FalseBB = User->getOperand(1);
16747 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16748 assert(NewBR == User);
16752 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16753 Cond.getOperand(0), Cond.getOperand(1));
16754 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16755 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
16756 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16757 Chain, Dest, CC, Cmp);
16758 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
16763 } else if (Cond.getOpcode() == ISD::SETCC &&
16764 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16765 // For FCMP_UNE, we can emit
16766 // two branches instead of an explicit AND instruction with a
16767 // separate test. However, we only do this if this block doesn't
16768 // have a fall-through edge, because this requires an explicit
16769 // jmp when the condition is false.
16770 if (Op.getNode()->hasOneUse()) {
16771 SDNode *User = *Op.getNode()->use_begin();
16772 // Look for an unconditional branch following this conditional branch.
16773 // We need this because we need to reverse the successors in order
16774 // to implement FCMP_UNE.
16775 if (User->getOpcode() == ISD::BR) {
16776 SDValue FalseBB = User->getOperand(1);
16778 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16779 assert(NewBR == User);
16782 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16783 Cond.getOperand(0), Cond.getOperand(1));
16784 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16785 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
16786 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16787 Chain, Dest, CC, Cmp);
16788 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
16798 // Look past the truncate if the high bits are known zero.
16799 Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
16801 // We know the result of AND is compared against zero. Try to match it to BT.
16803 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16804 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
16805 CC = NewSetCC.getOperand(0);
16806 Cond = NewSetCC.getOperand(1);
16813 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16814 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
16815 Cond = EmitTest(Cond, X86Cond, dl, DAG);
16817 Cond = ConvertCmpIfNecessary(Cond, DAG);
16818 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16819 Chain, Dest, CC, Cond);
16822 // Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
16823 // Calls to _alloca are needed to probe the stack when allocating more than 4K
16824 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16825 // that the guard pages used by the OS virtual memory manager are allocated in
16826 // the correct sequence.
16828 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16829 SelectionDAG &DAG) const {
16830 MachineFunction &MF = DAG.getMachineFunction();
16831 bool SplitStack = MF.shouldSplitStack();
16832 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
16837 SDNode *Node = Op.getNode();
16838 SDValue Chain = Op.getOperand(0);
16839 SDValue Size = Op.getOperand(1);
16840 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16841 EVT VT = Node->getValueType(0);
16843 // Chain the dynamic stack allocation so that it doesn't modify the stack
16844 // pointer when other instructions are using the stack.
16845 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
16847 bool Is64Bit = Subtarget.is64Bit();
16848 MVT SPTy = getPointerTy(DAG.getDataLayout());
16852 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16853 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16854 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16855 " not tell us which reg is the stack pointer!");
16857 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16858 Chain = SP.getValue(1);
16859 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
16860 unsigned StackAlign = TFI.getStackAlignment();
16861 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
16862 if (Align > StackAlign)
16863 Result = DAG.getNode(ISD::AND, dl, VT, Result,
16864 DAG.getConstant(-(uint64_t)Align, dl, VT));
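// -(uint64_t)Align is an all-ones mask with the low log2(Align) bits clear,
// so the AND rounds the adjusted stack pointer down to the requested
// alignment.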
16865 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
16866 } else if (SplitStack) {
16867 MachineRegisterInfo &MRI = MF.getRegInfo();
16870 // The 64-bit implementation of segmented stacks needs to clobber both r10
16871 // and r11. This makes it impossible to use it along with nested parameters.
16872 const Function *F = MF.getFunction();
16873 for (const auto &A : F->args()) {
16874 if (A.hasNestAttr())
16875 report_fatal_error("Cannot use segmented stacks with functions that "
16876 "have nested arguments.");
16880 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
16881 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16882 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16883 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16884 DAG.getRegister(Vreg, SPTy));
16886 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16887 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
16888 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
16890 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
16891 unsigned SPReg = RegInfo->getStackRegister();
16892 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16893 Chain = SP.getValue(1);
16896 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16897 DAG.getConstant(-(uint64_t)Align, dl, VT));
16898 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16904 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
16905 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
16907 SDValue Ops[2] = {Result, Chain};
16908 return DAG.getMergeValues(Ops, dl);
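// Note: on 32-bit targets and for Win64 callees, va_start only needs to store
// the address of the VarArgsFrameIndex slot; the four-field va_list
// (gp_offset, fp_offset, overflow_arg_area, reg_save_area) is built only for
// the remaining 64-bit calling conventions handled below.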
16911 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16912 MachineFunction &MF = DAG.getMachineFunction();
16913 auto PtrVT = getPointerTy(MF.getDataLayout());
16914 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16916 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16919 if (!Subtarget.is64Bit() ||
16920 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
16921 // vastart just stores the address of the VarArgsFrameIndex slot into the
16922 // memory location argument.
16923 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
16924 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16925 MachinePointerInfo(SV));
16929 // gp_offset (0 - 6 * 8)
16930 // fp_offset (48 - 48 + 8 * 16)
16931 // overflow_arg_area (points to parameters coming in memory).
16933 SmallVector<SDValue, 8> MemOps;
16934 SDValue FIN = Op.getOperand(1);
16936 SDValue Store = DAG.getStore(
16937 Op.getOperand(0), DL,
16938 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
16939 MachinePointerInfo(SV));
16940 MemOps.push_back(Store);
16943 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
16944 Store = DAG.getStore(
16945 Op.getOperand(0), DL,
16946 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
16947 MachinePointerInfo(SV, 4));
16948 MemOps.push_back(Store);
16950 // Store ptr to overflow_arg_area
16951 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
16952 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
16954 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
16955 MemOps.push_back(Store);
16957 // Store ptr to reg_save_area.
16958 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
16959 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
16960 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
16961 Store = DAG.getStore(
16962 Op.getOperand(0), DL, RSFIN, FIN,
16963 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
16964 MemOps.push_back(Store);
16965 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16968 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16969 assert(Subtarget.is64Bit() &&
16970 "LowerVAARG only handles 64-bit va_arg!");
16971 assert(Op.getNode()->getNumOperands() == 4);
16973 MachineFunction &MF = DAG.getMachineFunction();
16974 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
16975 // The Win64 ABI uses char* instead of a structure.
16976 return DAG.expandVAArg(Op.getNode());
16978 SDValue Chain = Op.getOperand(0);
16979 SDValue SrcPtr = Op.getOperand(1);
16980 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16981 unsigned Align = Op.getConstantOperandVal(3);
16984 EVT ArgVT = Op.getNode()->getValueType(0);
16985 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16986 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
16989 // Decide which area this value should be read from.
16990 // TODO: Implement the AMD64 ABI in its entirety. This simple
16991 // selection mechanism works only for the basic types.
16992 if (ArgVT == MVT::f80) {
16993 llvm_unreachable("va_arg for f80 not yet implemented");
16994 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16995 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
16996 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16997 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
16999 llvm_unreachable("Unhandled argument type in LowerVAARG");
17002 if (ArgMode == 2) {
17003 // Sanity Check: Make sure using fp_offset makes sense.
17004 assert(!Subtarget.useSoftFloat() &&
17005 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
17006 Subtarget.hasSSE1());
17009 // Insert VAARG_64 node into the DAG
17010 // VAARG_64 returns two values: Variable Argument Address, Chain
17011 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
17012 DAG.getConstant(ArgMode, dl, MVT::i8),
17013 DAG.getConstant(Align, dl, MVT::i32)};
17014 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
17015 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
17016 VTs, InstOps, MVT::i64,
17017 MachinePointerInfo(SV),
17019 /*Volatile=*/false,
17021 /*WriteMem=*/true);
17022 Chain = VAARG.getValue(1);
17024 // Load the next argument and return it
17025 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
17028 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
17029 SelectionDAG &DAG) {
17030 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
17031 // where a va_list is still an i8*.
17032 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
17033 if (Subtarget.isCallingConvWin64(
17034 DAG.getMachineFunction().getFunction()->getCallingConv()))
17035 // Probably a Win64 va_copy.
17036 return DAG.expandVACopy(Op.getNode());
17038 SDValue Chain = Op.getOperand(0);
17039 SDValue DstPtr = Op.getOperand(1);
17040 SDValue SrcPtr = Op.getOperand(2);
17041 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
17042 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17045 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17046 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
17048 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
17051 /// Handle vector element shifts where the shift amount is a constant.
17052 /// Takes immediate version of shift as input.
17053 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
17054 SDValue SrcOp, uint64_t ShiftAmt,
17055 SelectionDAG &DAG) {
17056 MVT ElementType = VT.getVectorElementType();
17058 // Fold this packed shift into its first operand if ShiftAmt is 0.
17062 // Check for ShiftAmt >= element width
17063 if (ShiftAmt >= ElementType.getSizeInBits()) {
17064 if (Opc == X86ISD::VSRAI)
17065 ShiftAmt = ElementType.getSizeInBits() - 1;
17067 return DAG.getConstant(0, dl, VT);
17070 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17071 && "Unknown target vector shift-by-constant node");
17073 // Fold this packed vector shift into a build vector if SrcOp is a
17074 // vector of Constants or UNDEFs and the SrcOp value type is the same as VT.
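// For example (illustrative values), a VSHLI of the constant vector
// <i32 1, 2, 3, 4> by 2 folds to the build vector <i32 4, 8, 12, 16> here.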
17075 if (VT == SrcOp.getSimpleValueType() &&
17076 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17077 SmallVector<SDValue, 8> Elts;
17078 unsigned NumElts = SrcOp->getNumOperands();
17079 ConstantSDNode *ND;
17082 default: llvm_unreachable("Unknown opcode!");
17083 case X86ISD::VSHLI:
17084 for (unsigned i=0; i!=NumElts; ++i) {
17085 SDValue CurrentOp = SrcOp->getOperand(i);
17086 if (CurrentOp->isUndef()) {
17087 Elts.push_back(CurrentOp);
17090 ND = cast<ConstantSDNode>(CurrentOp);
17091 const APInt &C = ND->getAPIntValue();
17092 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
17095 case X86ISD::VSRLI:
17096 for (unsigned i=0; i!=NumElts; ++i) {
17097 SDValue CurrentOp = SrcOp->getOperand(i);
17098 if (CurrentOp->isUndef()) {
17099 Elts.push_back(CurrentOp);
17102 ND = cast<ConstantSDNode>(CurrentOp);
17103 const APInt &C = ND->getAPIntValue();
17104 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
17107 case X86ISD::VSRAI:
17108 for (unsigned i=0; i!=NumElts; ++i) {
17109 SDValue CurrentOp = SrcOp->getOperand(i);
17110 if (CurrentOp->isUndef()) {
17111 Elts.push_back(CurrentOp);
17114 ND = cast<ConstantSDNode>(CurrentOp);
17115 const APInt &C = ND->getAPIntValue();
17116 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
17121 return DAG.getBuildVector(VT, dl, Elts);
17124 return DAG.getNode(Opc, dl, VT, SrcOp,
17125 DAG.getConstant(ShiftAmt, dl, MVT::i8));
17128 /// Handle vector element shifts where the shift amount may or may not be a
17129 /// constant. Takes immediate version of shift as input.
17130 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
17131 SDValue SrcOp, SDValue ShAmt,
17132 SelectionDAG &DAG) {
17133 MVT SVT = ShAmt.getSimpleValueType();
17134 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17136 // Catch shift-by-constant.
17137 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17138 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17139 CShAmt->getZExtValue(), DAG);
17141 // Change opcode to non-immediate version
17143 default: llvm_unreachable("Unknown target vector shift node");
17144 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17145 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
17146 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
17149 const X86Subtarget &Subtarget =
17150 static_cast<const X86Subtarget &>(DAG.getSubtarget());
17151 if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17152 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17153 // Let the shuffle legalizer expand this shift amount node.
17154 SDValue Op0 = ShAmt.getOperand(0);
17155 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17156 ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG);
17158 // Need to build a vector containing shift amount.
17159 // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
17160 SmallVector<SDValue, 4> ShOps;
17161 ShOps.push_back(ShAmt);
17162 if (SVT == MVT::i32) {
17163 ShOps.push_back(DAG.getConstant(0, dl, SVT));
17164 ShOps.push_back(DAG.getUNDEF(SVT));
17166 ShOps.push_back(DAG.getUNDEF(SVT));
17168 MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
17169 ShAmt = DAG.getBuildVector(BVT, dl, ShOps);
17172 // The return type has to be a 128-bit type with the same element
17173 // type as the input type.
17174 MVT EltVT = VT.getVectorElementType();
17175 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
17177 ShAmt = DAG.getBitcast(ShVT, ShAmt);
17178 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17181 /// \brief Return Mask with the necessary casting or extending
17182 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
17183 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
17184 const X86Subtarget &Subtarget, SelectionDAG &DAG,
17187 if (isAllOnesConstant(Mask))
17188 return DAG.getTargetConstant(1, dl, MaskVT);
17189 if (X86::isZeroNode(Mask))
17190 return DAG.getTargetConstant(0, dl, MaskVT);
17192 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
17193 // Mask should be extended
17194 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
17195 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
17198 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
17199 if (MaskVT == MVT::v64i1) {
17200 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
17201 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
17203 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
17204 DAG.getConstant(0, dl, MVT::i32));
17205 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
17206 DAG.getConstant(1, dl, MVT::i32));
17208 Lo = DAG.getBitcast(MVT::v32i1, Lo);
17209 Hi = DAG.getBitcast(MVT::v32i1, Hi);
17211 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
17213 // MaskVT requires fewer than 64 bits. Truncate the mask (should succeed in any case),
17215 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
17216 return DAG.getBitcast(MaskVT,
17217 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
17221 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17222 Mask.getSimpleValueType().getSizeInBits());
17223 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
17224 // are extracted by EXTRACT_SUBVECTOR.
17225 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17226 DAG.getBitcast(BitcastVT, Mask),
17227 DAG.getIntPtrConstant(0, dl));
17231 /// \brief Return (and \p Op, \p Mask) for compare instructions or
17232 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17233 /// necessary casting or extending for \p Mask when lowering masking intrinsics
17234 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17235 SDValue PreservedSrc,
17236 const X86Subtarget &Subtarget,
17237 SelectionDAG &DAG) {
17238 MVT VT = Op.getSimpleValueType();
17239 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17240 unsigned OpcodeSelect = ISD::VSELECT;
17243 if (isAllOnesConstant(Mask))
17246 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
17248 switch (Op.getOpcode()) {
17250 case X86ISD::PCMPEQM:
17251 case X86ISD::PCMPGTM:
17253 case X86ISD::CMPMU:
17254 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17255 case X86ISD::VFPCLASS:
17256 case X86ISD::VFPCLASSS:
17257 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
17258 case X86ISD::VTRUNC:
17259 case X86ISD::VTRUNCS:
17260 case X86ISD::VTRUNCUS:
17261 case ISD::FP_TO_FP16:
17262 // We can't use ISD::VSELECT here because it is not always "Legal"
17263 // for the destination type. For example, vpmovqb requires only AVX512,
17264 // while a vselect that operates on byte elements requires BWI.
17265 OpcodeSelect = X86ISD::SELECT;
17268 if (PreservedSrc.isUndef())
17269 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17270 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
17273 /// \brief Creates an SDNode for a predicated scalar operation.
17274 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
17275 /// The mask comes in as MVT::i8 and should be truncated
17276 /// to MVT::i1 while lowering masking intrinsics.
17277 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17278 /// "X86select" instead of "vselect". We just can't create the "vselect" node
17279 /// for a scalar instruction.
17280 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17281 SDValue PreservedSrc,
17282 const X86Subtarget &Subtarget,
17283 SelectionDAG &DAG) {
17284 if (isAllOnesConstant(Mask))
17287 MVT VT = Op.getSimpleValueType();
17289 // The mask should be of type MVT::i1
17290 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17292 if (Op.getOpcode() == X86ISD::FSETCC)
17293 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
17294 if (Op.getOpcode() == X86ISD::VFPCLASS ||
17295 Op.getOpcode() == X86ISD::VFPCLASSS)
17296 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
17298 if (PreservedSrc.isUndef())
17299 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17300 return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17303 static int getSEHRegistrationNodeSize(const Function *Fn) {
17304 if (!Fn->hasPersonalityFn())
17305 report_fatal_error(
17306 "querying registration node size for function without personality");
17307 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
17308 // WinEHStatePass for the full struct definition.
17309 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
17310 case EHPersonality::MSVC_X86SEH: return 24;
17311 case EHPersonality::MSVC_CXX: return 16;
17314 report_fatal_error(
17315 "can only recover FP for 32-bit MSVC EH personality functions");
17318 /// When the MSVC runtime transfers control to us, either to an outlined
17319 /// function or when returning to a parent frame after catching an exception, we
17320 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
17321 /// Here's the math:
17322 /// RegNodeBase = EntryEBP - RegNodeSize
17323 /// ParentFP = RegNodeBase - ParentFrameOffset
17324 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
17325 /// subtracting the offset (negative on x86) takes us back to the parent FP.
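/// Hypothetical example (values chosen for illustration): with the MSVC x86
/// SEH personality RegNodeSize is 24, so EntryEBP = 0x1000 gives
/// RegNodeBase = 0xFE8; with ParentFrameOffset = -16 this yields
/// ParentFP = 0xFE8 - (-16) = 0xFF8.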
17326 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
17327 SDValue EntryEBP) {
17328 MachineFunction &MF = DAG.getMachineFunction();
17331 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17332 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
17334 // It's possible that the parent function no longer has a personality function
17335 // if the exceptional code was optimized away, in which case we just return
17336 // the incoming EBP.
17337 if (!Fn->hasPersonalityFn())
17340 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
17341 // registration, or the .set_setframe offset.
17342 MCSymbol *OffsetSym =
17343 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
17344 GlobalValue::getRealLinkageName(Fn->getName()));
17345 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
17346 SDValue ParentFrameOffset =
17347 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
17349 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
17350 // prologue to RBP in the parent function.
17351 const X86Subtarget &Subtarget =
17352 static_cast<const X86Subtarget &>(DAG.getSubtarget());
17353 if (Subtarget.is64Bit())
17354 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
17356 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
17357 // RegNodeBase = EntryEBP - RegNodeSize
17358 // ParentFP = RegNodeBase - ParentFrameOffset
17359 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
17360 DAG.getConstant(RegNodeSize, dl, PtrVT));
17361 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
17364 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
17365 SelectionDAG &DAG) {
17367 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17368 MVT VT = Op.getSimpleValueType();
17369 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17371 switch(IntrData->Type) {
17372 case INTR_TYPE_1OP:
17373 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17374 case INTR_TYPE_2OP:
17375 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17377 case INTR_TYPE_2OP_IMM8:
17378 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17379 DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
17380 case INTR_TYPE_3OP:
17381 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17382 Op.getOperand(2), Op.getOperand(3));
17383 case INTR_TYPE_4OP:
17384 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17385 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
17386 case INTR_TYPE_1OP_MASK_RM: {
17387 SDValue Src = Op.getOperand(1);
17388 SDValue PassThru = Op.getOperand(2);
17389 SDValue Mask = Op.getOperand(3);
17390 SDValue RoundingMode;
17391 // We always add a rounding mode to the node.
17392 // If the rounding mode is not specified, we add the
17393 // "current direction" mode.
17394 if (Op.getNumOperands() == 4)
17396 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17398 RoundingMode = Op.getOperand(4);
17399 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17400 if (IntrWithRoundingModeOpcode != 0)
17401 if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() !=
17402 X86::STATIC_ROUNDING::CUR_DIRECTION)
17403 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17404 dl, Op.getValueType(), Src, RoundingMode),
17405 Mask, PassThru, Subtarget, DAG);
17406 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17408 Mask, PassThru, Subtarget, DAG);
17410 case INTR_TYPE_1OP_MASK: {
17411 SDValue Src = Op.getOperand(1);
17412 SDValue PassThru = Op.getOperand(2);
17413 SDValue Mask = Op.getOperand(3);
17414 // We add the rounding mode to the node when
17415 // - the RM opcode is specified and
17416 // - RM is not "current direction".
17417 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17418 if (IntrWithRoundingModeOpcode != 0) {
17419 SDValue Rnd = Op.getOperand(4);
17420 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17421 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17422 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17423 dl, Op.getValueType(),
17425 Mask, PassThru, Subtarget, DAG);
17428 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
17429 Mask, PassThru, Subtarget, DAG);
17431 case INTR_TYPE_SCALAR_MASK: {
17432 SDValue Src1 = Op.getOperand(1);
17433 SDValue Src2 = Op.getOperand(2);
17434 SDValue passThru = Op.getOperand(3);
17435 SDValue Mask = Op.getOperand(4);
17436 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
17437 Mask, passThru, Subtarget, DAG);
17439 case INTR_TYPE_SCALAR_MASK_RM: {
17440 SDValue Src1 = Op.getOperand(1);
17441 SDValue Src2 = Op.getOperand(2);
17442 SDValue Src0 = Op.getOperand(3);
17443 SDValue Mask = Op.getOperand(4);
17444 // There are 2 kinds of intrinsics in this group:
17445 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
17446 // (2) With rounding mode and sae - 7 operands.
17447 if (Op.getNumOperands() == 6) {
17448 SDValue Sae = Op.getOperand(5);
17449 unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
17450 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
17452 Mask, Src0, Subtarget, DAG);
17454 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
17455 SDValue RoundingMode = Op.getOperand(5);
17456 SDValue Sae = Op.getOperand(6);
17457 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17458 RoundingMode, Sae),
17459 Mask, Src0, Subtarget, DAG);
17461 case INTR_TYPE_2OP_MASK:
17462 case INTR_TYPE_2OP_IMM8_MASK: {
17463 SDValue Src1 = Op.getOperand(1);
17464 SDValue Src2 = Op.getOperand(2);
17465 SDValue PassThru = Op.getOperand(3);
17466 SDValue Mask = Op.getOperand(4);
17468 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
17469 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
17471 // We specify 2 possible opcodes for intrinsics with rounding modes.
17472 // First, we check if the intrinsic may have a non-default rounding mode,
17473 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17474 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17475 if (IntrWithRoundingModeOpcode != 0) {
17476 SDValue Rnd = Op.getOperand(5);
17477 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17478 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17479 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17480 dl, Op.getValueType(),
17482 Mask, PassThru, Subtarget, DAG);
17485 // TODO: Intrinsics should have fast-math-flags to propagate.
17486 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
17487 Mask, PassThru, Subtarget, DAG);
17489 case INTR_TYPE_2OP_MASK_RM: {
17490 SDValue Src1 = Op.getOperand(1);
17491 SDValue Src2 = Op.getOperand(2);
17492 SDValue PassThru = Op.getOperand(3);
17493 SDValue Mask = Op.getOperand(4);
17494 // We specify 2 possible modes for intrinsics, with/without a rounding mode.
17496 // First, we check if the intrinsic has a rounding mode (6 operands);
17497 // if not, we set the rounding mode to "current".
17499 if (Op.getNumOperands() == 6)
17500 Rnd = Op.getOperand(5);
17502 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17503 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17505 Mask, PassThru, Subtarget, DAG);
17507 case INTR_TYPE_3OP_SCALAR_MASK_RM: {
17508 SDValue Src1 = Op.getOperand(1);
17509 SDValue Src2 = Op.getOperand(2);
17510 SDValue Src3 = Op.getOperand(3);
17511 SDValue PassThru = Op.getOperand(4);
17512 SDValue Mask = Op.getOperand(5);
17513 SDValue Sae = Op.getOperand(6);
17515 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
17517 Mask, PassThru, Subtarget, DAG);
17519 case INTR_TYPE_3OP_MASK_RM: {
17520 SDValue Src1 = Op.getOperand(1);
17521 SDValue Src2 = Op.getOperand(2);
17522 SDValue Imm = Op.getOperand(3);
17523 SDValue PassThru = Op.getOperand(4);
17524 SDValue Mask = Op.getOperand(5);
17525 // We specify 2 possible modes for intrinsics, with/without a rounding mode.
17527 // First, we check if the intrinsic has a rounding mode (7 operands);
17528 // if not, we set the rounding mode to "current".
17530 if (Op.getNumOperands() == 7)
17531 Rnd = Op.getOperand(6);
17533 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17534 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17535 Src1, Src2, Imm, Rnd),
17536 Mask, PassThru, Subtarget, DAG);
17538 case INTR_TYPE_3OP_IMM8_MASK:
17539 case INTR_TYPE_3OP_MASK:
17540 case INSERT_SUBVEC: {
17541 SDValue Src1 = Op.getOperand(1);
17542 SDValue Src2 = Op.getOperand(2);
17543 SDValue Src3 = Op.getOperand(3);
17544 SDValue PassThru = Op.getOperand(4);
17545 SDValue Mask = Op.getOperand(5);
17547 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
17548 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
17549 else if (IntrData->Type == INSERT_SUBVEC) {
17550 // imm should be adapted to ISD::INSERT_SUBVECTOR behavior
17551 assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
17552 unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
17553 Imm *= Src2.getSimpleValueType().getVectorNumElements();
17554 Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
17557 // We specify 2 possible opcodes for intrinsics with rounding modes.
17558 // First, we check if the intrinsic may have a non-default rounding mode,
17559 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17560 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17561 if (IntrWithRoundingModeOpcode != 0) {
17562 SDValue Rnd = Op.getOperand(6);
17563 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17564 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17565 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17566 dl, Op.getValueType(),
17567 Src1, Src2, Src3, Rnd),
17568 Mask, PassThru, Subtarget, DAG);
17571 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17573 Mask, PassThru, Subtarget, DAG);
17575 case VPERM_2OP_MASK : {
17576 SDValue Src1 = Op.getOperand(1);
17577 SDValue Src2 = Op.getOperand(2);
17578 SDValue PassThru = Op.getOperand(3);
17579 SDValue Mask = Op.getOperand(4);
17581 // Swap Src1 and Src2 in the node creation
17582 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
17583 Mask, PassThru, Subtarget, DAG);
17585 case VPERM_3OP_MASKZ:
17586 case VPERM_3OP_MASK:{
17587 // Src2 is the PassThru
17588 SDValue Src1 = Op.getOperand(1);
17589 SDValue Src2 = Op.getOperand(2);
17590 SDValue Src3 = Op.getOperand(3);
17591 SDValue Mask = Op.getOperand(4);
17592 MVT VT = Op.getSimpleValueType();
17593 SDValue PassThru = SDValue();
17595 // set PassThru element
17596 if (IntrData->Type == VPERM_3OP_MASKZ)
17597 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17599 PassThru = DAG.getBitcast(VT, Src2);
17601 // Swap Src1 and Src2 in the node creation
17602 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17603 dl, Op.getValueType(),
17605 Mask, PassThru, Subtarget, DAG);
17609 case FMA_OP_MASK: {
17610 SDValue Src1 = Op.getOperand(1);
17611 SDValue Src2 = Op.getOperand(2);
17612 SDValue Src3 = Op.getOperand(3);
17613 SDValue Mask = Op.getOperand(4);
17614 MVT VT = Op.getSimpleValueType();
17615 SDValue PassThru = SDValue();
17617 // set PassThru element
17618 if (IntrData->Type == FMA_OP_MASKZ)
17619 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17620 else if (IntrData->Type == FMA_OP_MASK3)
17625 // We specify 2 possible opcodes for intrinsics with rounding modes.
17626 // First, we check if the intrinsic may have a non-default rounding mode,
17627 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17628 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17629 if (IntrWithRoundingModeOpcode != 0) {
17630 SDValue Rnd = Op.getOperand(5);
17631 if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17632 X86::STATIC_ROUNDING::CUR_DIRECTION)
17633 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17634 dl, Op.getValueType(),
17635 Src1, Src2, Src3, Rnd),
17636 Mask, PassThru, Subtarget, DAG);
17638 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17639 dl, Op.getValueType(),
17641 Mask, PassThru, Subtarget, DAG);
17643 case FMA_OP_SCALAR_MASK:
17644 case FMA_OP_SCALAR_MASK3:
17645 case FMA_OP_SCALAR_MASKZ: {
17646 SDValue Src1 = Op.getOperand(1);
17647 SDValue Src2 = Op.getOperand(2);
17648 SDValue Src3 = Op.getOperand(3);
17649 SDValue Mask = Op.getOperand(4);
17650 MVT VT = Op.getSimpleValueType();
17651 SDValue PassThru = SDValue();
17653 // set PassThru element
17654 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
17655 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17656 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
17661 SDValue Rnd = Op.getOperand(5);
17662 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
17663 Op.getValueType(), Src1, Src2,
17665 Mask, PassThru, Subtarget, DAG);
17667 case TERLOG_OP_MASK:
17668 case TERLOG_OP_MASKZ: {
17669 SDValue Src1 = Op.getOperand(1);
17670 SDValue Src2 = Op.getOperand(2);
17671 SDValue Src3 = Op.getOperand(3);
17672 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
17673 SDValue Mask = Op.getOperand(5);
17674 MVT VT = Op.getSimpleValueType();
17675 SDValue PassThru = Src1;
17676 // Set PassThru element.
17677 if (IntrData->Type == TERLOG_OP_MASKZ)
17678 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17680 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17681 Src1, Src2, Src3, Src4),
17682 Mask, PassThru, Subtarget, DAG);
17685 // FPclass intrinsics with mask
17686 SDValue Src1 = Op.getOperand(1);
17687 MVT VT = Src1.getSimpleValueType();
17688 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17689 SDValue Imm = Op.getOperand(2);
17690 SDValue Mask = Op.getOperand(3);
17691 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17692 Mask.getSimpleValueType().getSizeInBits());
17693 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
17694 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
17695 DAG.getTargetConstant(0, dl, MaskVT),
17697 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17698 DAG.getUNDEF(BitcastVT), FPclassMask,
17699 DAG.getIntPtrConstant(0, dl));
17700 return DAG.getBitcast(Op.getValueType(), Res);
17703 SDValue Src1 = Op.getOperand(1);
17704 SDValue Imm = Op.getOperand(2);
17705 SDValue Mask = Op.getOperand(3);
17706 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
17707 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
17708 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
17709 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask);
17712 case CMP_MASK_CC: {
17713 // Comparison intrinsics with masks.
17714 // Example of transformation:
17715 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17716 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17718 // (v8i1 (insert_subvector undef,
17719 // (v2i1 (and (PCMPEQM %a, %b),
17720 // (extract_subvector
17721 // (v8i1 (bitcast %mask)), 0))), 0))))
17722 MVT VT = Op.getOperand(1).getSimpleValueType();
17723 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17724 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17725 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17726 Mask.getSimpleValueType().getSizeInBits());
17728 if (IntrData->Type == CMP_MASK_CC) {
17729 SDValue CC = Op.getOperand(3);
17730 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
17731 // We specify 2 possible opcodes for intrinsics with rounding modes.
17732 // First, we check if the intrinsic may have a non-default rounding mode,
17733 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17734 if (IntrData->Opc1 != 0) {
17735 SDValue Rnd = Op.getOperand(5);
17736 if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17737 X86::STATIC_ROUNDING::CUR_DIRECTION)
17738 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
17739 Op.getOperand(2), CC, Rnd);
17741 // Default rounding mode.
17743 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17744 Op.getOperand(2), CC);
17747 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17748 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17751 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17752 DAG.getTargetConstant(0, dl,
17755 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17756 DAG.getUNDEF(BitcastVT), CmpMask,
17757 DAG.getIntPtrConstant(0, dl));
17758 return DAG.getBitcast(Op.getValueType(), Res);
17760 case CMP_MASK_SCALAR_CC: {
17761 SDValue Src1 = Op.getOperand(1);
17762 SDValue Src2 = Op.getOperand(2);
17763 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
17764 SDValue Mask = Op.getOperand(4);
17767 if (IntrData->Opc1 != 0) {
17768 SDValue Rnd = Op.getOperand(5);
17769 if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17770 X86::STATIC_ROUNDING::CUR_DIRECTION)
17771 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
17773 // Default rounding mode.
17775 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
17777 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
17778 DAG.getTargetConstant(0, dl,
17782 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
17784 case COMI: { // Comparison intrinsics
17785 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17786 SDValue LHS = Op.getOperand(1);
17787 SDValue RHS = Op.getOperand(2);
17788 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17789 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
17792 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
17793 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17794 DAG.getConstant(X86::COND_E, dl, MVT::i8), Comi);
17795 SDValue SetNP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17796 DAG.getConstant(X86::COND_NP, dl, MVT::i8),
17798 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
17801 case ISD::SETNE: { // (ZF = 1 or PF = 1)
17802 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17803 DAG.getConstant(X86::COND_NE, dl, MVT::i8), Comi);
17804 SDValue SetP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17805 DAG.getConstant(X86::COND_P, dl, MVT::i8),
17807 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
17810 case ISD::SETGT: // (CF = 0 and ZF = 0)
17811 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17812 DAG.getConstant(X86::COND_A, dl, MVT::i8), Comi);
17814 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
17815 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17816 DAG.getConstant(X86::COND_A, dl, MVT::i8), InvComi);
17819 case ISD::SETGE: // CF = 0
17820 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17821 DAG.getConstant(X86::COND_AE, dl, MVT::i8), Comi);
17823 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
17824 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17825 DAG.getConstant(X86::COND_AE, dl, MVT::i8), InvComi);
17828 llvm_unreachable("Unexpected illegal condition!");
17830 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17832 case COMI_RM: { // Comparison intrinsics with Sae
17833 SDValue LHS = Op.getOperand(1);
17834 SDValue RHS = Op.getOperand(2);
17835 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
17836 SDValue Sae = Op.getOperand(4);
17839 if (cast<ConstantSDNode>(Sae)->getZExtValue() ==
17840 X86::STATIC_ROUNDING::CUR_DIRECTION)
17841 FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
17842 DAG.getConstant(CondVal, dl, MVT::i8));
17844 FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
17845 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
17846 // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
17847 return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
17850 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17851 Op.getOperand(1), Op.getOperand(2), DAG);
17852 case COMPRESS_EXPAND_IN_REG: {
17853 SDValue Mask = Op.getOperand(3);
17854 SDValue DataToCompress = Op.getOperand(1);
17855 SDValue PassThru = Op.getOperand(2);
17856 if (isAllOnesConstant(Mask)) // return data as is
17857 return Op.getOperand(1);
17859 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17861 Mask, PassThru, Subtarget, DAG);
17864 SDValue Mask = Op.getOperand(1);
17865 MVT MaskVT = MVT::getVectorVT(MVT::i1,
17866 Mask.getSimpleValueType().getSizeInBits());
17867 Mask = DAG.getBitcast(MaskVT, Mask);
17868 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
17871 MVT VT = Op.getSimpleValueType();
17872 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
17874 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
17875 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
17876 // Arguments should be swapped.
17877 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
17878 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
17880 return DAG.getBitcast(VT, Res);
17883 case FIXUPIMMS_MASKZ:
17885 case FIXUPIMM_MASKZ:{
17886 SDValue Src1 = Op.getOperand(1);
17887 SDValue Src2 = Op.getOperand(2);
17888 SDValue Src3 = Op.getOperand(3);
17889 SDValue Imm = Op.getOperand(4);
17890 SDValue Mask = Op.getOperand(5);
17891 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
17892 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
17893 // We specify 2 possible modes for intrinsics, with/without a rounding mode.
17895 // First, we check if the intrinsic has a rounding mode (7 operands);
17896 // if not, we set the rounding mode to "current".
17898 if (Op.getNumOperands() == 7)
17899 Rnd = Op.getOperand(6);
17901 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17902 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
17903 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17904 Src1, Src2, Src3, Imm, Rnd),
17905 Mask, Passthru, Subtarget, DAG);
17906 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
17907 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17908 Src1, Src2, Src3, Imm, Rnd),
17909 Mask, Passthru, Subtarget, DAG);
17911 case CONVERT_TO_MASK: {
17912 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
17913 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
17914 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
17916 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
17918 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17919 DAG.getUNDEF(BitcastVT), CvtMask,
17920 DAG.getIntPtrConstant(0, dl));
17921 return DAG.getBitcast(Op.getValueType(), Res);
17923 case CONVERT_MASK_TO_VEC: {
17924 SDValue Mask = Op.getOperand(1);
17925 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17926 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
17927 return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
17929 case BRCST_SUBVEC_TO_VEC: {
17930 SDValue Src = Op.getOperand(1);
17931 SDValue Passthru = Op.getOperand(2);
17932 SDValue Mask = Op.getOperand(3);
17933 EVT resVT = Passthru.getValueType();
17934 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
17935 DAG.getUNDEF(resVT), Src,
17936 DAG.getIntPtrConstant(0, dl));
17938 if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
17939 immVal = DAG.getConstant(0x44, dl, MVT::i8);
17941 immVal = DAG.getConstant(0, dl, MVT::i8);
17942 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17943 subVec, subVec, immVal),
17944 Mask, Passthru, Subtarget, DAG);
17946 case BRCST32x2_TO_VEC: {
17947 SDValue Src = Op.getOperand(1);
17948 SDValue PassThru = Op.getOperand(2);
17949 SDValue Mask = Op.getOperand(3);
17951 assert((VT.getScalarType() == MVT::i32 ||
17952 VT.getScalarType() == MVT::f32) && "Unexpected type!");
17953 // Bitcast Src to packed 64-bit elements.
17954 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
17955 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
17956 Src = DAG.getBitcast(BitcastVT, Src);
17958 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
17959 Mask, PassThru, Subtarget, DAG);
17967 default: return SDValue(); // Don't custom lower most intrinsics.
17969 case Intrinsic::x86_avx2_permd:
17970 case Intrinsic::x86_avx2_permps:
17971 // Operands intentionally swapped. Mask is last operand to intrinsic,
17972 // but second operand for node/instruction.
17973 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
17974 Op.getOperand(2), Op.getOperand(1));
17976 // ptest and testp intrinsics. The intrinsics these come from are designed to
17977 // return an integer value, not just an instruction, so lower them to the ptest
17978 // or testp pattern and a setcc for the result.
17979 case Intrinsic::x86_sse41_ptestz:
17980 case Intrinsic::x86_sse41_ptestc:
17981 case Intrinsic::x86_sse41_ptestnzc:
17982 case Intrinsic::x86_avx_ptestz_256:
17983 case Intrinsic::x86_avx_ptestc_256:
17984 case Intrinsic::x86_avx_ptestnzc_256:
17985 case Intrinsic::x86_avx_vtestz_ps:
17986 case Intrinsic::x86_avx_vtestc_ps:
17987 case Intrinsic::x86_avx_vtestnzc_ps:
17988 case Intrinsic::x86_avx_vtestz_pd:
17989 case Intrinsic::x86_avx_vtestc_pd:
17990 case Intrinsic::x86_avx_vtestnzc_pd:
17991 case Intrinsic::x86_avx_vtestz_ps_256:
17992 case Intrinsic::x86_avx_vtestc_ps_256:
17993 case Intrinsic::x86_avx_vtestnzc_ps_256:
17994 case Intrinsic::x86_avx_vtestz_pd_256:
17995 case Intrinsic::x86_avx_vtestc_pd_256:
17996 case Intrinsic::x86_avx_vtestnzc_pd_256: {
17997 bool IsTestPacked = false;
18000 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
18001 case Intrinsic::x86_avx_vtestz_ps:
18002 case Intrinsic::x86_avx_vtestz_pd:
18003 case Intrinsic::x86_avx_vtestz_ps_256:
18004 case Intrinsic::x86_avx_vtestz_pd_256:
18005 IsTestPacked = true; // Fallthrough
18006 case Intrinsic::x86_sse41_ptestz:
18007 case Intrinsic::x86_avx_ptestz_256:
18009 X86CC = X86::COND_E;
18011 case Intrinsic::x86_avx_vtestc_ps:
18012 case Intrinsic::x86_avx_vtestc_pd:
18013 case Intrinsic::x86_avx_vtestc_ps_256:
18014 case Intrinsic::x86_avx_vtestc_pd_256:
18015 IsTestPacked = true; // Fallthrough
18016 case Intrinsic::x86_sse41_ptestc:
18017 case Intrinsic::x86_avx_ptestc_256:
18019 X86CC = X86::COND_B;
18021 case Intrinsic::x86_avx_vtestnzc_ps:
18022 case Intrinsic::x86_avx_vtestnzc_pd:
18023 case Intrinsic::x86_avx_vtestnzc_ps_256:
18024 case Intrinsic::x86_avx_vtestnzc_pd_256:
18025 IsTestPacked = true; // Fallthrough
18026 case Intrinsic::x86_sse41_ptestnzc:
18027 case Intrinsic::x86_avx_ptestnzc_256:
18029 X86CC = X86::COND_A;
18033 SDValue LHS = Op.getOperand(1);
18034 SDValue RHS = Op.getOperand(2);
18035 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
18036 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
18037 SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
18038 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
18039 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18041 case Intrinsic::x86_avx512_kortestz_w:
18042 case Intrinsic::x86_avx512_kortestc_w: {
18043 unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
18044 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
18045 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
18046 SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
18047 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
18048 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
18049 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18052 case Intrinsic::x86_sse42_pcmpistria128:
18053 case Intrinsic::x86_sse42_pcmpestria128:
18054 case Intrinsic::x86_sse42_pcmpistric128:
18055 case Intrinsic::x86_sse42_pcmpestric128:
18056 case Intrinsic::x86_sse42_pcmpistrio128:
18057 case Intrinsic::x86_sse42_pcmpestrio128:
18058 case Intrinsic::x86_sse42_pcmpistris128:
18059 case Intrinsic::x86_sse42_pcmpestris128:
18060 case Intrinsic::x86_sse42_pcmpistriz128:
18061 case Intrinsic::x86_sse42_pcmpestriz128: {
18065 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
18066 case Intrinsic::x86_sse42_pcmpistria128:
18067 Opcode = X86ISD::PCMPISTRI;
18068 X86CC = X86::COND_A;
18070 case Intrinsic::x86_sse42_pcmpestria128:
18071 Opcode = X86ISD::PCMPESTRI;
18072 X86CC = X86::COND_A;
18074 case Intrinsic::x86_sse42_pcmpistric128:
18075 Opcode = X86ISD::PCMPISTRI;
18076 X86CC = X86::COND_B;
18078 case Intrinsic::x86_sse42_pcmpestric128:
18079 Opcode = X86ISD::PCMPESTRI;
18080 X86CC = X86::COND_B;
18082 case Intrinsic::x86_sse42_pcmpistrio128:
18083 Opcode = X86ISD::PCMPISTRI;
18084 X86CC = X86::COND_O;
18086 case Intrinsic::x86_sse42_pcmpestrio128:
18087 Opcode = X86ISD::PCMPESTRI;
18088 X86CC = X86::COND_O;
18090 case Intrinsic::x86_sse42_pcmpistris128:
18091 Opcode = X86ISD::PCMPISTRI;
18092 X86CC = X86::COND_S;
18094 case Intrinsic::x86_sse42_pcmpestris128:
18095 Opcode = X86ISD::PCMPESTRI;
18096 X86CC = X86::COND_S;
18098 case Intrinsic::x86_sse42_pcmpistriz128:
18099 Opcode = X86ISD::PCMPISTRI;
18100 X86CC = X86::COND_E;
18102 case Intrinsic::x86_sse42_pcmpestriz128:
18103 Opcode = X86ISD::PCMPESTRI;
18104 X86CC = X86::COND_E;
18107 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
18108 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
18109 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
18110 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18111 DAG.getConstant(X86CC, dl, MVT::i8),
18112 SDValue(PCMP.getNode(), 1));
18113 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18116 case Intrinsic::x86_sse42_pcmpistri128:
18117 case Intrinsic::x86_sse42_pcmpestri128: {
18119 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
18120 Opcode = X86ISD::PCMPISTRI;
18122 Opcode = X86ISD::PCMPESTRI;
18124 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
18125 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
18126 return DAG.getNode(Opcode, dl, VTs, NewOps);
18129 case Intrinsic::eh_sjlj_lsda: {
18130 MachineFunction &MF = DAG.getMachineFunction();
18131 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18132 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
18133 auto &Context = MF.getMMI().getContext();
18134 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
18135 Twine(MF.getFunctionNumber()));
18136 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
18139 case Intrinsic::x86_seh_lsda: {
18140 // Compute the symbol for the LSDA. We know it'll get emitted later.
18141 MachineFunction &MF = DAG.getMachineFunction();
18142 SDValue Op1 = Op.getOperand(1);
18143 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
18144 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
18145 GlobalValue::getRealLinkageName(Fn->getName()));
18147 // Generate a simple absolute symbol reference. This intrinsic is only
18148 // supported on 32-bit Windows, which isn't PIC.
18149 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
18150 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
18153 case Intrinsic::x86_seh_recoverfp: {
18154 SDValue FnOp = Op.getOperand(1);
18155 SDValue IncomingFPOp = Op.getOperand(2);
18156 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
18157 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
18159 report_fatal_error(
18160 "llvm.x86.seh.recoverfp must take a function as the first argument");
18161 return recoverFramePointer(DAG, Fn, IncomingFPOp);
18164 case Intrinsic::localaddress: {
18165 // Returns one of the stack, base, or frame pointer registers, depending on
18166 // which is used to reference local variables.
18167 MachineFunction &MF = DAG.getMachineFunction();
18168 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18170 if (RegInfo->hasBasePointer(MF))
18171 Reg = RegInfo->getBaseRegister();
18172 else // This function handles the SP or FP case.
18173 Reg = RegInfo->getPtrSizedFrameRegister(MF);
18174 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
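// Helper that builds the machine node for a masked gather: the scale becomes
// an i8 target constant, the displacement and segment operands are zero, and
// the mask operand is legalized through getMaskNode before the node is created.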
18179 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18180 SDValue Src, SDValue Mask, SDValue Base,
18181 SDValue Index, SDValue ScaleOp, SDValue Chain,
18182 const X86Subtarget &Subtarget) {
18184 auto *C = cast<ConstantSDNode>(ScaleOp);
18185 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18186 MVT MaskVT = MVT::getVectorVT(MVT::i1,
18187 Index.getSimpleValueType().getVectorNumElements());
18189 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18190 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
18191 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18192 SDValue Segment = DAG.getRegister(0, MVT::i32);
18194 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
18195 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
18196 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
18197 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
18198 return DAG.getMergeValues(RetOps, dl);
18201 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18202 SDValue Src, SDValue Mask, SDValue Base,
18203 SDValue Index, SDValue ScaleOp, SDValue Chain,
18204 const X86Subtarget &Subtarget) {
18206 auto *C = cast<ConstantSDNode>(ScaleOp);
18207 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18208 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18209 SDValue Segment = DAG.getRegister(0, MVT::i32);
18210 MVT MaskVT = MVT::getVectorVT(MVT::i1,
18211 Index.getSimpleValueType().getVectorNumElements());
18213 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18214 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
18215 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
18216 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
18217 return SDValue(Res, 1);
18220 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18221 SDValue Mask, SDValue Base, SDValue Index,
18222 SDValue ScaleOp, SDValue Chain,
18223 const X86Subtarget &Subtarget) {
18225 auto *C = cast<ConstantSDNode>(ScaleOp);
18226 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18227 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18228 SDValue Segment = DAG.getRegister(0, MVT::i32);
18230 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
18231 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18232 //SDVTList VTs = DAG.getVTList(MVT::Other);
18233 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
18234 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
18235 return SDValue(Res, 0);
18238 /// Handles the lowering of builtin intrinsics that read performance monitor
18239 /// counters (x86_rdpmc).
18240 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
18242 const X86Subtarget &Subtarget,
18243 SmallVectorImpl<SDValue> &Results) {
18244 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
18245 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
18248 // The ECX register is used to select the index of the performance counter to read.
18250 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
18252 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
18254 // Reads the content of a 64-bit performance counter and returns it in the
18255 // registers EDX:EAX.
18256 if (Subtarget.is64Bit()) {
18257 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
18258 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
18261 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
18262 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
18265 Chain = HI.getValue(1);
18267 if (Subtarget.is64Bit()) {
18268 // The EAX register is loaded with the low-order 32 bits. The EDX register
18269 // is loaded with the supported high-order bits of the counter.
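// Merge the two halves as (HI << 32) | LO to form the full 64-bit result.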
18270 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
18271 DAG.getConstant(32, DL, MVT::i8));
18272 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
18273 Results.push_back(Chain);
18277 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
18278 SDValue Ops[] = { LO, HI };
18279 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
18280 Results.push_back(Pair);
18281 Results.push_back(Chain);
18284 /// Handles the lowering of builtin intrinsics that read the time stamp counter
18285 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
18286 /// READCYCLECOUNTER nodes.
18287 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
18289 const X86Subtarget &Subtarget,
18290 SmallVectorImpl<SDValue> &Results) {
18291 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
18292 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
18295 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
18296 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
18297 // and the EAX register is loaded with the low-order 32 bits.
18298 if (Subtarget.is64Bit()) {
18299 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
18300 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
18303 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
18304 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
18307 SDValue Chain = HI.getValue(1);
18309 if (Opcode == X86ISD::RDTSCP_DAG) {
18310 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
18312 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
18313 // the ECX register. Add 'ecx' explicitly to the chain.
18314 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
18316 // Explicitly store the content of ECX at the location passed as input
18317 // to the 'rdtscp' intrinsic.
18318 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
18319 MachinePointerInfo());
18322 if (Subtarget.is64Bit()) {
18323 // The EDX register is loaded with the high-order 32 bits of the MSR, and
18324 // the EAX register is loaded with the low-order 32 bits.
18325 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
18326 DAG.getConstant(32, DL, MVT::i8));
18327 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
18328 Results.push_back(Chain);
18332 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
18333 SDValue Ops[] = { LO, HI };
18334 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
18335 Results.push_back(Pair);
18336 Results.push_back(Chain);
18339 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
18340 SelectionDAG &DAG) {
18341 SmallVector<SDValue, 2> Results;
18343 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
18345 return DAG.getMergeValues(Results, DL);
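// Record the frame index of the alloca passed to llvm.x86.seh.ehregnode in
// WinEHFuncInfo so later WinEH lowering can locate the registration node; the
// call itself produces no DAG nodes and simply returns its chain.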
18348 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
18349 MachineFunction &MF = DAG.getMachineFunction();
18350 SDValue Chain = Op.getOperand(0);
18351 SDValue RegNode = Op.getOperand(2);
18352 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
18354 report_fatal_error("EH registrations only live in functions using WinEH");
18356 // Cast the operand to an alloca, and remember the frame index.
18357 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
18359 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
18360 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
18362 // Return the chain operand without making any DAG nodes.
18366 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
18367 MachineFunction &MF = DAG.getMachineFunction();
18368 SDValue Chain = Op.getOperand(0);
18369 SDValue EHGuard = Op.getOperand(2);
18370 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
18372 report_fatal_error("EHGuard only live in functions using WinEH");
18374 // Cast the operand to an alloca, and remember the frame index.
18375 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
18377 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
18378 EHInfo->EHGuardFrameIndex = FINode->getIndex();
18380 // Return the chain operand without making any DAG nodes.
18384 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
18385 SelectionDAG &DAG) {
18386 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
18388 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
18390 if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
18391 return MarkEHRegistrationNode(Op, DAG);
18392 if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
18393 return MarkEHGuard(Op, DAG);
18394 if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
18395 IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
18396 IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
18397 IntNo == llvm::Intrinsic::x86_flags_write_u64) {
18398 // We need a frame pointer because this will get lowered to a PUSH/POP
18400 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
18401 MFI->setHasCopyImplyingStackAdjustment(true);
18402 // Don't do anything here, we will expand these intrinsics out later
18403 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
18410 switch(IntrData->Type) {
18411 default: llvm_unreachable("Unknown Intrinsic Type");
18414 // Emit the node with the right value type.
18415 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
18416 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
18418 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
18419 // Otherwise return the value from Rand, which is always 0, casted to i32.
18420 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
18421 DAG.getConstant(1, dl, Op->getValueType(1)),
18422 DAG.getConstant(X86::COND_B, dl, MVT::i32),
18423 SDValue(Result.getNode(), 1) };
18424 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
18425 DAG.getVTList(Op->getValueType(1), MVT::Glue),
18428 // Return { result, isValid, chain }.
18429 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
18430 SDValue(Result.getNode(), 2));
    // gather(v1, mask, index, base, scale);
18434 SDValue Chain = Op.getOperand(0);
18435 SDValue Src = Op.getOperand(2);
18436 SDValue Base = Op.getOperand(3);
18437 SDValue Index = Op.getOperand(4);
18438 SDValue Mask = Op.getOperand(5);
18439 SDValue Scale = Op.getOperand(6);
18440 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
    // scatter(base, mask, index, v1, scale);
18445 SDValue Chain = Op.getOperand(0);
18446 SDValue Base = Op.getOperand(2);
18447 SDValue Mask = Op.getOperand(3);
18448 SDValue Index = Op.getOperand(4);
18449 SDValue Src = Op.getOperand(5);
18450 SDValue Scale = Op.getOperand(6);
18451 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
18452 Scale, Chain, Subtarget);
18455 SDValue Hint = Op.getOperand(6);
18456 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
18457 assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
18458 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
18459 SDValue Chain = Op.getOperand(0);
18460 SDValue Mask = Op.getOperand(2);
18461 SDValue Index = Op.getOperand(3);
18462 SDValue Base = Op.getOperand(4);
18463 SDValue Scale = Op.getOperand(5);
18464 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
18467 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
18469 SmallVector<SDValue, 2> Results;
18470 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
18472 return DAG.getMergeValues(Results, dl);
18474 // Read Performance Monitoring Counters.
18476 SmallVector<SDValue, 2> Results;
18477 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
18478 return DAG.getMergeValues(Results, dl);
18480 // XTEST intrinsics.
18482 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18483 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
18484 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18485 DAG.getConstant(X86::COND_NE, dl, MVT::i8),
18487 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
18488 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
18489 Ret, SDValue(InTrans.getNode(), 1));
18493 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18494 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
18495 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
18496 DAG.getConstant(-1, dl, MVT::i8));
18497 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
18498 Op.getOperand(4), GenCF.getValue(1));
18499 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
18500 Op.getOperand(5), MachinePointerInfo());
18501 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18502 DAG.getConstant(X86::COND_B, dl, MVT::i8),
18504 SDValue Results[] = { SetCC, Store };
18505 return DAG.getMergeValues(Results, dl);
18507 case COMPRESS_TO_MEM: {
18508 SDValue Mask = Op.getOperand(4);
18509 SDValue DataToCompress = Op.getOperand(3);
18510 SDValue Addr = Op.getOperand(2);
18511 SDValue Chain = Op.getOperand(0);
18512 MVT VT = DataToCompress.getSimpleValueType();
18514 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18515 assert(MemIntr && "Expected MemIntrinsicSDNode!");
18517 if (isAllOnesConstant(Mask)) // return just a store
18518 return DAG.getStore(Chain, dl, DataToCompress, Addr,
18519 MemIntr->getMemOperand());
18521 SDValue Compressed =
18522 getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
18523 Mask, DAG.getUNDEF(VT), Subtarget, DAG);
18524 return DAG.getStore(Chain, dl, Compressed, Addr,
18525 MemIntr->getMemOperand());
18527 case TRUNCATE_TO_MEM_VI8:
18528 case TRUNCATE_TO_MEM_VI16:
18529 case TRUNCATE_TO_MEM_VI32: {
18530 SDValue Mask = Op.getOperand(4);
18531 SDValue DataToTruncate = Op.getOperand(3);
18532 SDValue Addr = Op.getOperand(2);
18533 SDValue Chain = Op.getOperand(0);
18535 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18536 assert(MemIntr && "Expected MemIntrinsicSDNode!");
18538 EVT VT = MemIntr->getMemoryVT();
18540 if (isAllOnesConstant(Mask)) // return just a truncate store
18541 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, VT,
18542 MemIntr->getMemOperand());
18544 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18545 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18547 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, VT,
18548 MemIntr->getMemOperand(), true);
18550 case EXPAND_FROM_MEM: {
18551 SDValue Mask = Op.getOperand(4);
18552 SDValue PassThru = Op.getOperand(3);
18553 SDValue Addr = Op.getOperand(2);
18554 SDValue Chain = Op.getOperand(0);
18555 MVT VT = Op.getSimpleValueType();
18557 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18558 assert(MemIntr && "Expected MemIntrinsicSDNode!");
18560 SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr,
18561 MemIntr->getMemOperand());
18563 if (isAllOnesConstant(Mask)) // return just a load
18564 return DataToExpand;
18566 SDValue Results[] = {
18567 getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
18568 Mask, PassThru, Subtarget, DAG), Chain};
18569 return DAG.getMergeValues(Results, dl);
18574 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
18575 SelectionDAG &DAG) const {
18576 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
18577 MFI->setReturnAddressIsTaken(true);
18579 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
18582 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18584 EVT PtrVT = getPointerTy(DAG.getDataLayout());
18587 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
18588 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18589 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
18590 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
18591 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
18592 MachinePointerInfo());
18595 // Just load the return address.
18596 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
18597 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
18598 MachinePointerInfo());
18601 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
18602 MachineFunction &MF = DAG.getMachineFunction();
18603 MachineFrameInfo *MFI = MF.getFrameInfo();
18604 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18605 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18606 EVT VT = Op.getValueType();
18608 MFI->setFrameAddressIsTaken(true);
18610 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind codes
    // simultaneously.
18614 int FrameAddrIndex = FuncInfo->getFAIndex();
18615 if (!FrameAddrIndex) {
18616 // Set up a frame object for the return address.
18617 unsigned SlotSize = RegInfo->getSlotSize();
18618 FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
18619 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
18620 FuncInfo->setFAIndex(FrameAddrIndex);
18622 return DAG.getFrameIndex(FrameAddrIndex, VT);
18625 unsigned FrameReg =
18626 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
18627 SDLoc dl(Op); // FIXME probably not meaningful
18628 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18629 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
18630 (FrameReg == X86::EBP && VT == MVT::i32)) &&
18631 "Invalid Frame Register!");
18632 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
18634 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
18635 MachinePointerInfo());
18639 // FIXME? Maybe this could be a TableGen attribute on some registers and
18640 // this table could be generated automatically from RegInfo.
18641 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
18642 SelectionDAG &DAG) const {
18643 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18644 const MachineFunction &MF = DAG.getMachineFunction();
18646 unsigned Reg = StringSwitch<unsigned>(RegName)
18647 .Case("esp", X86::ESP)
18648 .Case("rsp", X86::RSP)
18649 .Case("ebp", X86::EBP)
18650 .Case("rbp", X86::RBP)
18653 if (Reg == X86::EBP || Reg == X86::RBP) {
18654 if (!TFI.hasFP(MF))
18655 report_fatal_error("register " + StringRef(RegName) +
18656 " is allocatable: function has no frame pointer");
18659 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18660 unsigned FrameReg =
18661 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
18662 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
18663 "Invalid Frame Register!");
18671 report_fatal_error("Invalid register name global variable");
18674 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
18675 SelectionDAG &DAG) const {
18676 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18677 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
18680 unsigned X86TargetLowering::getExceptionPointerRegister(
18681 const Constant *PersonalityFn) const {
18682 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
18683 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
18685 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
18688 unsigned X86TargetLowering::getExceptionSelectorRegister(
18689 const Constant *PersonalityFn) const {
18690 // Funclet personalities don't use selectors (the runtime does the selection).
18691 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
18692 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
18695 bool X86TargetLowering::needsFixedCatchObjects() const {
18696 return Subtarget.isTargetWin64();
18699 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
18700 SDValue Chain = Op.getOperand(0);
18701 SDValue Offset = Op.getOperand(1);
18702 SDValue Handler = Op.getOperand(2);
18705 EVT PtrVT = getPointerTy(DAG.getDataLayout());
18706 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18707 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
18708 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18709 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18710 "Invalid Frame Register!");
18711 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18712 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18714 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18715 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
18717 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
18718 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
18719 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18721 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
18722 DAG.getRegister(StoreAddrReg, PtrVT));
18725 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18726 SelectionDAG &DAG) const {
  // If the subtarget is not 64-bit, we may need the global base reg
  // after isel expands the pseudo, i.e., after the CGBR pass has run.
  // Therefore, ask for the GlobalBaseReg now, so that the pass
  // inserts the code for us in case we need it.
  // Otherwise, we would end up referencing a virtual register
  // that is not defined!
18734 if (!Subtarget.is64Bit()) {
18735 const X86InstrInfo *TII = Subtarget.getInstrInfo();
18736 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
18738 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18739 DAG.getVTList(MVT::i32, MVT::Other),
18740 Op.getOperand(0), Op.getOperand(1));
18743 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18744 SelectionDAG &DAG) const {
18746 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18747 Op.getOperand(0), Op.getOperand(1));
18750 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
18751 SelectionDAG &DAG) const {
18753 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
18757 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18758 return Op.getOperand(0);
18761 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18762 SelectionDAG &DAG) const {
18763 SDValue Root = Op.getOperand(0);
18764 SDValue Trmp = Op.getOperand(1); // trampoline
18765 SDValue FPtr = Op.getOperand(2); // nested function
18766 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
18769 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18770 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
18772 if (Subtarget.is64Bit()) {
18773 SDValue OutChains[6];
18775 // Large code-model.
18776 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
18777 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18779 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18780 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18782 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
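    // Roughly, the bytes stored below encode (at offsets 0, 10 and 20):
    //   49 BB <8-byte fptr>   movabs r11, <fptr>
    //   49 BA <8-byte nest>   movabs r10, <nest>
    //   49 FF E3              jmp r11     (the REX.W on the jmp is redundant)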
18784 // Load the pointer to the nested function into R11.
18785 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18786 SDValue Addr = Trmp;
18787 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18788 Addr, MachinePointerInfo(TrmpAddr));
18790 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18791 DAG.getConstant(2, dl, MVT::i64));
18793 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
18794 /* Alignment = */ 2);
18796 // Load the 'nest' parameter value into R10.
18797 // R10 is specified in X86CallingConv.td
18798 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18799 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18800 DAG.getConstant(10, dl, MVT::i64));
18801 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18802 Addr, MachinePointerInfo(TrmpAddr, 10));
18804 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18805 DAG.getConstant(12, dl, MVT::i64));
18807 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
18808 /* Alignment = */ 2);
18810 // Jump to the nested function.
18811 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18812 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18813 DAG.getConstant(20, dl, MVT::i64));
18814 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18815 Addr, MachinePointerInfo(TrmpAddr, 20));
18817 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
18818 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18819 DAG.getConstant(22, dl, MVT::i64));
18820 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
18821 Addr, MachinePointerInfo(TrmpAddr, 22));
18823 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18825 const Function *Func =
18826 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
18827 CallingConv::ID CC = Func->getCallingConv();
18832 llvm_unreachable("Unsupported calling convention");
18833 case CallingConv::C:
18834 case CallingConv::X86_StdCall: {
18835 // Pass 'nest' parameter in ECX.
18836 // Must be kept in sync with X86CallingConv.td
18837 NestReg = X86::ECX;
18839 // Check that ECX wasn't needed by an 'inreg' parameter.
18840 FunctionType *FTy = Func->getFunctionType();
18841 const AttributeSet &Attrs = Func->getAttributes();
18843 if (!Attrs.isEmpty() && !Func->isVarArg()) {
18844 unsigned InRegCount = 0;
18847 for (FunctionType::param_iterator I = FTy->param_begin(),
18848 E = FTy->param_end(); I != E; ++I, ++Idx)
18849 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
18850 auto &DL = DAG.getDataLayout();
18851 // FIXME: should only count parameters that are lowered to integers.
18852 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
18855 if (InRegCount > 2) {
18856 report_fatal_error("Nest register in use - reduce number of inreg"
18862 case CallingConv::X86_FastCall:
18863 case CallingConv::X86_ThisCall:
18864 case CallingConv::Fast:
18865 // Pass 'nest' parameter in EAX.
18866 // Must be kept in sync with X86CallingConv.td
18867 NestReg = X86::EAX;
18871 SDValue OutChains[4];
18872 SDValue Addr, Disp;
18874 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18875 DAG.getConstant(10, dl, MVT::i32));
18876 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18878 // This is storing the opcode for MOV32ri.
18879 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18880 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18882 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
18883 Trmp, MachinePointerInfo(TrmpAddr));
18885 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18886 DAG.getConstant(1, dl, MVT::i32));
18888 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
18889 /* Alignment = */ 1);
18891 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18892 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18893 DAG.getConstant(5, dl, MVT::i32));
18894 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
18895 Addr, MachinePointerInfo(TrmpAddr, 5),
18896 /* Alignment = */ 1);
18898 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18899 DAG.getConstant(6, dl, MVT::i32));
18901 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
18902 /* Alignment = */ 1);
18904 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18908 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18909 SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

   FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

   To perform the conversion, we do:
     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */
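  // For example, FPSR bits 11:10 == 01 (round toward -inf) gives
  // ((0 | (1 << 1)) + 1) & 3 == 3, FLT_ROUNDS' encoding of "round toward
  // -inf"; bits 11:10 == 11 (round toward zero) gives (3 + 1) & 3 == 0.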
18929 MachineFunction &MF = DAG.getMachineFunction();
18930 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18931 unsigned StackAlignment = TFI.getStackAlignment();
18932 MVT VT = Op.getSimpleValueType();
18935 // Save FP Control Word to stack slot
18936 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18937 SDValue StackSlot =
18938 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
18940 MachineMemOperand *MMO =
18941 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
18942 MachineMemOperand::MOStore, 2, 2);
18944 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18945 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18946 DAG.getVTList(MVT::Other),
18947 Ops, MVT::i16, MMO);
18949 // Load FP Control Word from stack slot
18951 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
18953 // Transform as necessary
18955 DAG.getNode(ISD::SRL, DL, MVT::i16,
18956 DAG.getNode(ISD::AND, DL, MVT::i16,
18957 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
18958 DAG.getConstant(11, DL, MVT::i8));
18960 DAG.getNode(ISD::SRL, DL, MVT::i16,
18961 DAG.getNode(ISD::AND, DL, MVT::i16,
18962 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
18963 DAG.getConstant(9, DL, MVT::i8));
18966 DAG.getNode(ISD::AND, DL, MVT::i16,
18967 DAG.getNode(ISD::ADD, DL, MVT::i16,
18968 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18969 DAG.getConstant(1, DL, MVT::i16)),
18970 DAG.getConstant(3, DL, MVT::i16));
18972 return DAG.getNode((VT.getSizeInBits() < 16 ?
18973 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
/// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
//
// 1. i32/i64 128/256-bit vectors (native support requires VLX) are extended
//    to 512-bit vectors.
// 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
//    ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
//    split the vector, perform the operation on its Lo and Hi parts, and
//    concatenate the results.
18984 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
18985 assert(Op.getOpcode() == ISD::CTLZ);
18987 MVT VT = Op.getSimpleValueType();
18988 MVT EltVT = VT.getVectorElementType();
18989 unsigned NumElems = VT.getVectorNumElements();
18991 if (EltVT == MVT::i64 || EltVT == MVT::i32) {
18992 // Extend to 512 bit vector.
18993 assert((VT.is256BitVector() || VT.is128BitVector()) &&
18994 "Unsupported value type for operation");
18996 MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
18997 SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
18998 DAG.getUNDEF(NewVT),
19000 DAG.getIntPtrConstant(0, dl));
19001 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
19003 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
19004 DAG.getIntPtrConstant(0, dl));
19007 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
19008 "Unsupported element type");
19010 if (16 < NumElems) {
    // Split the vector; its Lo and Hi parts will be handled in the next iteration.
19013 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
19014 MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
19016 Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
19017 Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
19019 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
19022 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
19024 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
19025 "Unsupported value type for operation");
19027 // Use native supported vector instruction vplzcntd.
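  // Zero-extending an i8/i16 element to i32 prepends 32 - EltBits zero bits,
  // so the dword lzcnt over-counts by exactly that amount; the subtraction of
  // Delta below corrects for it, e.g. ctlz8(x) == ctlz32(zext(x)) - 24.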
19028 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
19029 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
19030 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
19031 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
19033 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
19036 // Lower CTLZ using a PSHUFB lookup table implementation.
19037 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
19038 const X86Subtarget &Subtarget,
19039 SelectionDAG &DAG) {
19040 MVT VT = Op.getSimpleValueType();
19041 int NumElts = VT.getVectorNumElements();
19042 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
19043 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
19045 // Per-nibble leading zero PSHUFB lookup table.
19046 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
19047 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
19048 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
19049 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
19051 SmallVector<SDValue, 64> LUTVec;
19052 for (int i = 0; i < NumBytes; ++i)
19053 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
19054 SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
  // Begin by bitcasting the input to a byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // add).
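  // For example, for the byte 0x1A the hi nibble is 0x1 (LUT gives 3) and is
  // non-zero, so CTLZ == 3; for the byte 0x05 the hi nibble is 0x0 (LUT gives
  // 4) and the lo nibble is 0x5 (LUT gives 1), so CTLZ == 4 + 1 == 5.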
19061 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
19062 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
19064 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
19065 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
19066 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
19067 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
19068 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
19070 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
19071 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
19072 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
19073 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
  // Merge the result from vXi8 back to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we are at the target width.
19080 while (CurrVT != VT) {
19081 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
19082 int CurrNumElts = CurrVT.getVectorNumElements();
19083 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
19084 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
19085 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
19087 // Check if the upper half of the input element is zero.
19088 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
19089 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
19090 HiZ = DAG.getBitcast(NextVT, HiZ);
    // Move the upper/lower halves to the lower bits as we'll be extending to
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
    // together.
19095 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
19096 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
19097 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
19098 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
19099 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
19106 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
19107 const X86Subtarget &Subtarget,
19108 SelectionDAG &DAG) {
19109 MVT VT = Op.getSimpleValueType();
19110 SDValue Op0 = Op.getOperand(0);
19112 if (Subtarget.hasAVX512())
19113 return LowerVectorCTLZ_AVX512(Op, DAG);
19115 // Decompose 256-bit ops into smaller 128-bit ops.
19116 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
19117 unsigned NumElems = VT.getVectorNumElements();
19119 // Extract each 128-bit vector, perform ctlz and concat the result.
19120 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
19121 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
19123 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
19124 DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
19125 DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
19128 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
19129 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
19132 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
19133 SelectionDAG &DAG) {
19134 MVT VT = Op.getSimpleValueType();
19136 unsigned NumBits = VT.getSizeInBits();
19138 unsigned Opc = Op.getOpcode();
19141 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
19143 Op = Op.getOperand(0);
19144 if (VT == MVT::i8) {
    // Zero extend to i32 since there is no i8 bsr.
19147 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
19150 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
19151 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
19152 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
19154 if (Opc == ISD::CTLZ) {
19155 // If src is zero (i.e. bsr sets ZF), returns NumBits.
19158 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
19159 DAG.getConstant(X86::COND_E, dl, MVT::i8),
19162 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
19165 // Finally xor with NumBits-1.
19166 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
19167 DAG.getConstant(NumBits - 1, dl, OpVT));
19170 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
19174 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
19175 MVT VT = Op.getSimpleValueType();
19176 unsigned NumBits = VT.getScalarSizeInBits();
19179 if (VT.isVector()) {
19180 SDValue N0 = Op.getOperand(0);
19181 SDValue Zero = DAG.getConstant(0, dl, VT);
19183 // lsb(x) = (x & -x)
19184 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
19185 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
19187 // cttz_undef(x) = (width - 1) - ctlz(lsb)
19188 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
19189 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
19190 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
19191 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
19194 // cttz(x) = ctpop(lsb - 1)
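    // For example, with 8-bit lanes and x = 0b00101000: lsb = 0b00001000,
    // ctlz(lsb) = 4 so (8 - 1) - 4 = 3, and lsb - 1 = 0b00000111 has
    // ctpop = 3; both agree with cttz(x) = 3.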
19195 SDValue One = DAG.getConstant(1, dl, VT);
19196 return DAG.getNode(ISD::CTPOP, dl, VT,
19197 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
19200 assert(Op.getOpcode() == ISD::CTTZ &&
19201 "Only scalar CTTZ requires custom lowering");
19203 // Issue a bsf (scan bits forward) which also sets EFLAGS.
19204 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19205 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
19207 // If src is zero (i.e. bsf sets ZF), returns NumBits.
19210 DAG.getConstant(NumBits, dl, VT),
19211 DAG.getConstant(X86::COND_E, dl, MVT::i8),
19214 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
19217 /// Break a 256-bit integer operation into two new 128-bit ones and then
19218 /// concatenate the result back.
19219 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
19220 MVT VT = Op.getSimpleValueType();
19222 assert(VT.is256BitVector() && VT.isInteger() &&
19223 "Unsupported value type for operation");
19225 unsigned NumElems = VT.getVectorNumElements();
19228 // Extract the LHS vectors
19229 SDValue LHS = Op.getOperand(0);
19230 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
19231 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
19233 // Extract the RHS vectors
19234 SDValue RHS = Op.getOperand(1);
19235 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
19236 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
19238 MVT EltVT = VT.getVectorElementType();
19239 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19241 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19242 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
19243 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
19246 /// Break a 512-bit integer operation into two new 256-bit ones and then
19247 /// concatenate the result back.
19248 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
19249 MVT VT = Op.getSimpleValueType();
19251 assert(VT.is512BitVector() && VT.isInteger() &&
19252 "Unsupported value type for operation");
19254 unsigned NumElems = VT.getVectorNumElements();
19257 // Extract the LHS vectors
19258 SDValue LHS = Op.getOperand(0);
19259 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
19260 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
19262 // Extract the RHS vectors
19263 SDValue RHS = Op.getOperand(1);
19264 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
19265 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
19267 MVT EltVT = VT.getVectorElementType();
19268 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19270 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19271 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
19272 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
19275 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
19276 if (Op.getValueType() == MVT::i1)
19277 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
19278 Op.getOperand(0), Op.getOperand(1));
19279 assert(Op.getSimpleValueType().is256BitVector() &&
19280 Op.getSimpleValueType().isInteger() &&
19281 "Only handle AVX 256-bit vector integer operation");
19282 return Lower256IntArith(Op, DAG);
19285 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
19286 if (Op.getValueType() == MVT::i1)
19287 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
19288 Op.getOperand(0), Op.getOperand(1));
19289 assert(Op.getSimpleValueType().is256BitVector() &&
19290 Op.getSimpleValueType().isInteger() &&
19291 "Only handle AVX 256-bit vector integer operation");
19292 return Lower256IntArith(Op, DAG);
19295 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
19296 assert(Op.getSimpleValueType().is256BitVector() &&
19297 Op.getSimpleValueType().isInteger() &&
19298 "Only handle AVX 256-bit vector integer operation");
19299 return Lower256IntArith(Op, DAG);
19302 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
19303 SelectionDAG &DAG) {
19305 MVT VT = Op.getSimpleValueType();
19308 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
19310 // Decompose 256-bit ops into smaller 128-bit ops.
19311 if (VT.is256BitVector() && !Subtarget.hasInt256())
19312 return Lower256IntArith(Op, DAG);
19314 SDValue A = Op.getOperand(0);
19315 SDValue B = Op.getOperand(1);
19317 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
19318 // vector pairs, multiply and truncate.
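  // For example, for byte elements a = 0x80 (-128) and b = 0x03: sign-extending
  // to i16 gives 0xFF80 and 0x0003, the i16 product is 0xFE80 (-384), and
  // truncation keeps the low byte 0x80, the correct wrapped i8 result.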
19319 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
19320 if (Subtarget.hasInt256()) {
19321 // For 512-bit vectors, split into 256-bit vectors to allow the
19322 // sign-extension to occur.
19323 if (VT == MVT::v64i8)
19324 return Lower512IntArith(Op, DAG);
19326 // For 256-bit vectors, split into 128-bit vectors to allow the
19327 // sign-extension to occur. We don't need this on AVX512BW as we can
19328 // safely sign-extend to v32i16.
19329 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
19330 return Lower256IntArith(Op, DAG);
19332 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
19333 return DAG.getNode(
19334 ISD::TRUNCATE, dl, VT,
19335 DAG.getNode(ISD::MUL, dl, ExVT,
19336 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
19337 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
19340 assert(VT == MVT::v16i8 &&
19341 "Pre-AVX2 support only supports v16i8 multiplication");
19342 MVT ExVT = MVT::v8i16;
19344 // Extract the lo parts and sign extend to i16
19346 if (Subtarget.hasSSE41()) {
19347 ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
19348 BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
19350 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
19351 -1, 4, -1, 5, -1, 6, -1, 7};
19352 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19353 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19354 ALo = DAG.getBitcast(ExVT, ALo);
19355 BLo = DAG.getBitcast(ExVT, BLo);
19356 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
19357 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
19360 // Extract the hi parts and sign extend to i16
19362 if (Subtarget.hasSSE41()) {
19363 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
19364 -1, -1, -1, -1, -1, -1, -1, -1};
19365 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19366 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19367 AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
19368 BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
19370 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
19371 -1, 12, -1, 13, -1, 14, -1, 15};
19372 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19373 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19374 AHi = DAG.getBitcast(ExVT, AHi);
19375 BHi = DAG.getBitcast(ExVT, BHi);
19376 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
19377 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
  // Multiply, mask the lower 8 bits of the lo/hi results and pack
19381 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
19382 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
19383 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
19384 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
19385 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
19388 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
19389 if (VT == MVT::v4i32) {
19390 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
19391 "Should not custom lower when pmuldq is available!");
19393 // Extract the odd parts.
19394 static const int UnpackMask[] = { 1, -1, 3, -1 };
19395 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
19396 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
19398 // Multiply the even parts.
19399 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
19400 // Now multiply odd parts.
19401 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
19403 Evens = DAG.getBitcast(VT, Evens);
19404 Odds = DAG.getBitcast(VT, Odds);
    // Merge the two vectors back together with a shuffle. This expands into 2
    // instructions.
19408 static const int ShufMask[] = { 0, 4, 2, 6 };
19409 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
19412 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
19413 "Only know how to lower V2I64/V4I64/V8I64 multiply");
19415 // Ahi = psrlqi(a, 32);
19416 // Bhi = psrlqi(b, 32);
19418 // AloBlo = pmuludq(a, b);
19419 // AloBhi = pmuludq(a, Bhi);
19420 // AhiBlo = pmuludq(Ahi, b);
19422 // AloBhi = psllqi(AloBhi, 32);
19423 // AhiBlo = psllqi(AhiBlo, 32);
19424 // return AloBlo + AloBhi + AhiBlo;
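  //
  // This follows from splitting each 64-bit operand into 32-bit halves:
  //   a * b = (Alo + Ahi * 2^32) * (Blo + Bhi * 2^32)
  //         = Alo*Blo + (Alo*Bhi + Ahi*Blo) * 2^32   (mod 2^64)
  // where the Ahi*Bhi term vanishes because it is shifted left by 64 bits.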
19426 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
19427 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
19429 SDValue AhiBlo = Ahi;
19430 SDValue AloBhi = Bhi;
19431 // Bit cast to 32-bit vectors for MULUDQ
19432 MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
19433 (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
19434 A = DAG.getBitcast(MulVT, A);
19435 B = DAG.getBitcast(MulVT, B);
19436 Ahi = DAG.getBitcast(MulVT, Ahi);
19437 Bhi = DAG.getBitcast(MulVT, Bhi);
19439 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
  // After shifting right by constant values, the result may be all-zero.
19441 if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
19442 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
19443 AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
19445 if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
19446 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
19447 AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
19450 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
19451 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
19454 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
19455 SelectionDAG &DAG) {
19457 MVT VT = Op.getSimpleValueType();
19459 // Decompose 256-bit ops into smaller 128-bit ops.
19460 if (VT.is256BitVector() && !Subtarget.hasInt256())
19461 return Lower256IntArith(Op, DAG);
19463 // Only i8 vectors should need custom lowering after this.
19464 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
19465 "Unsupported vector type");
19467 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
19468 // logical shift down the upper half and pack back to i8.
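  // For example, for MULHU on byte elements a = 200 and b = 3 the widened
  // i16 product is 600 (0x0258); shifting right by 8 leaves 0x02, which is
  // the high byte of the 8x8->16 multiply.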
19469 SDValue A = Op.getOperand(0);
19470 SDValue B = Op.getOperand(1);
19472 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
19473 // and then ashr/lshr the upper bits down to the lower bits before multiply.
19474 unsigned Opcode = Op.getOpcode();
19475 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
19476 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
19478 // AVX2 implementations - extend xmm subvectors to ymm.
19479 if (Subtarget.hasInt256()) {
19480 SDValue Lo = DAG.getIntPtrConstant(0, dl);
19481 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
19483 if (VT == MVT::v32i8) {
19484 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
19485 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
19486 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
19487 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
19488 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
19489 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
19490 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
19491 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
19492 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
19493 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
19494 DAG.getConstant(8, dl, MVT::v16i16));
19495 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
19496 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
19497 DAG.getConstant(8, dl, MVT::v16i16));
19498 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
19499 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
19500 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
19501 16, 17, 18, 19, 20, 21, 22, 23};
19502 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
19503 24, 25, 26, 27, 28, 29, 30, 31};
19504 return DAG.getNode(X86ISD::PACKUS, dl, VT,
19505 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
19506 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
19509 SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
19510 SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
19511 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
19512 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
19513 DAG.getConstant(8, dl, MVT::v16i16));
19514 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
19515 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
19516 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
19519 assert(VT == MVT::v16i8 &&
19520 "Pre-AVX2 support only supports v16i8 multiplication");
19521 MVT ExVT = MVT::v8i16;
19523 // Extract the lo parts and zero/sign extend to i16.
19525 if (Subtarget.hasSSE41()) {
19526 ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
19527 BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
19529 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
19530 -1, 4, -1, 5, -1, 6, -1, 7};
19531 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19532 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19533 ALo = DAG.getBitcast(ExVT, ALo);
19534 BLo = DAG.getBitcast(ExVT, BLo);
19535 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
19536 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
19539 // Extract the hi parts and zero/sign extend to i16.
19541 if (Subtarget.hasSSE41()) {
19542 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
19543 -1, -1, -1, -1, -1, -1, -1, -1};
19544 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19545 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19546 AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
19547 BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
19549 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
19550 -1, 12, -1, 13, -1, 14, -1, 15};
19551 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19552 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19553 AHi = DAG.getBitcast(ExVT, AHi);
19554 BHi = DAG.getBitcast(ExVT, BHi);
19555 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
19556 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
  // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
19560 // pack back to v16i8.
19561 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
19562 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
19563 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
19564 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
19565 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
19568 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
19569 assert(Subtarget.isTargetWin64() && "Unexpected target");
19570 EVT VT = Op.getValueType();
19571 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
19572 "Unexpected return type for lowering");
19576 switch (Op->getOpcode()) {
19577 default: llvm_unreachable("Unexpected request for libcall!");
19578 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
19579 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
19580 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
19581 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
19582 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
19583 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
19587 SDValue InChain = DAG.getEntryNode();
19589 TargetLowering::ArgListTy Args;
19590 TargetLowering::ArgListEntry Entry;
19591 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
19592 EVT ArgVT = Op->getOperand(i).getValueType();
19593 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
19594 "Unexpected argument type for lowering");
19595 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
19596 Entry.Node = StackPtr;
19597 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
19598 MachinePointerInfo(), /* Alignment = */ 16);
19599 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19600 Entry.Ty = PointerType::get(ArgTy,0);
19601 Entry.isSExt = false;
19602 Entry.isZExt = false;
19603 Args.push_back(Entry);
19606 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
19607 getPointerTy(DAG.getDataLayout()));
19609 TargetLowering::CallLoweringInfo CLI(DAG);
19610 CLI.setDebugLoc(dl).setChain(InChain)
19611 .setCallee(getLibcallCallingConv(LC),
19612 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
19613 Callee, std::move(Args))
19614 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
19616 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
19617 return DAG.getBitcast(VT, CallInfo.first);
19620 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
19621 SelectionDAG &DAG) {
19622 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
19623 MVT VT = Op0.getSimpleValueType();
19626 // Decompose 256-bit ops into smaller 128-bit ops.
19627 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
19628 unsigned Opcode = Op.getOpcode();
19629 unsigned NumElems = VT.getVectorNumElements();
19630 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
19631 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
19632 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
19633 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
19634 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
19635 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
19636 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
19638 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
19639 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
19641 return DAG.getMergeValues(Ops, dl);
19644 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
19645 (VT == MVT::v8i32 && Subtarget.hasInt256()));
  // PMULxD operations multiply each even value (starting at 0) of LHS with
  // the related value of RHS and produce a widened result.
  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  //
  // In other words, to have all the results, we need to perform two PMULxD:
  // 1. one with the even values.
  // 2. one with the odd values.
  // To achieve #2, we need to place the odd values at an even position.
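  // After both PMULxDs we have <ae|cg> and <bf|dh> as wide elements; the
  // shuffles below bitcast them back to the narrow type and interleave the
  // low (resp. high) halves to rebuild the low and high parts of <ae|bf|cg|dh>.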
19657 // Place the odd value at an even position (basically, shift all values 1
19658 // step to the left):
19659 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
19660 // <a|b|c|d> => <b|undef|d|undef>
19661 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
19662 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
19663 // <e|f|g|h> => <f|undef|h|undef>
19664 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
19665 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
19667 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
19669 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
19670 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
19672 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
19673 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
19674 // => <2 x i64> <ae|cg>
19675 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
19676 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
19677 // => <2 x i64> <bf|dh>
19678 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
19680 // Shuffle it back into the right order.
19681 SDValue Highs, Lows;
19682 if (VT == MVT::v8i32) {
19683 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
19684 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
19685 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
19686 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
19688 const int HighMask[] = {1, 5, 3, 7};
19689 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
19690 const int LowMask[] = {0, 4, 2, 6};
19691 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
  // unsigned multiply.
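  //
  // This uses the identity (per lane, working modulo the lane width):
  //   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
  // T1/T2 below compute those correction terms by arithmetic-shifting the
  // sign bit across the lane and AND'ing with the other operand.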
19696 if (IsSigned && !Subtarget.hasSSE41()) {
19697 SDValue ShAmt = DAG.getConstant(
19699 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
19700 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
19701 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
19702 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
19703 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
19705 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
19706 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
  // The first result of MUL_LOHI is actually the low value, followed by the
  // high value.
19711 SDValue Ops[] = {Lows, Highs};
19712 return DAG.getMergeValues(Ops, dl);
19715 // Return true if the required (according to Opcode) shift-imm form is natively
19716 // supported by the Subtarget
19717 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
19719 if (VT.getScalarSizeInBits() < 16)
19722 if (VT.is512BitVector() &&
19723 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
19726 bool LShift = VT.is128BitVector() ||
19727 (VT.is256BitVector() && Subtarget.hasInt256());
19729 bool AShift = LShift && (Subtarget.hasVLX() ||
19730 (VT != MVT::v2i64 && VT != MVT::v4i64));
19731 return (Opcode == ISD::SRA) ? AShift : LShift;
19734 // The shift amount is a variable, but it is the same for all vector lanes.
19735 // These instructions are defined together with shift-immediate.
19737 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
19739 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
19742 // Return true if the required (according to Opcode) variable-shift form is
19743 // natively supported by the Subtarget
19744 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
19747 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
19750 // vXi16 supported only on AVX-512, BWI
19751 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
19754 if (VT.is512BitVector() || Subtarget.hasVLX())
19757 bool LShift = VT.is128BitVector() || VT.is256BitVector();
19758 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
19759 return (Opcode == ISD::SRA) ? AShift : LShift;
19762 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
19763 const X86Subtarget &Subtarget) {
19764 MVT VT = Op.getSimpleValueType();
19766 SDValue R = Op.getOperand(0);
19767 SDValue Amt = Op.getOperand(1);
19769 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
19770 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
19772 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
19773 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
19774 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
19775 SDValue Ex = DAG.getBitcast(ExVT, R);
19777 if (ShiftAmt >= 32) {
19778 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
19780 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
19781 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
19782 ShiftAmt - 32, DAG);
19783 if (VT == MVT::v2i64)
19784 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
19785 if (VT == MVT::v4i64)
19786 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
19787 {9, 1, 11, 3, 13, 5, 15, 7});
19789 // SRA upper i32, SHL whole i64 and select lower i32.
19790 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
19793 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
19794 Lower = DAG.getBitcast(ExVT, Lower);
19795 if (VT == MVT::v2i64)
19796 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
19797 if (VT == MVT::v4i64)
19798 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
19799 {8, 1, 10, 3, 12, 5, 14, 7});
19801 return DAG.getBitcast(VT, Ex);
19804 // Optimize shl/srl/sra with constant shift amount.
19805 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
19806 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
19807 uint64_t ShiftAmt = ShiftConst->getZExtValue();
19809 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
19810 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
19812 // i64 SRA needs to be performed as partial shifts.
19813 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
19814 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
19815 return ArithmeticShiftRight64(ShiftAmt);
19817 if (VT == MVT::v16i8 ||
19818 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
19819 VT == MVT::v64i8) {
19820 unsigned NumElts = VT.getVectorNumElements();
19821 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
19823 // Simple i8 add case
19824 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
19825 return DAG.getNode(ISD::ADD, dl, VT, R, R);
19827 // ashr(R, 7) === cmp_slt(R, 0)
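        // Arithmetically shifting an i8 lane right by 7 smears the sign bit
        // across the lane, producing 0 or -1, which is exactly what
        // pcmpgt(0, R) computes.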
19828 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
19829 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
19830 if (VT.is512BitVector()) {
19831 assert(VT == MVT::v64i8 && "Unexpected element type!");
19832 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
19833 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
19835 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
19838 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
19839 if (VT == MVT::v16i8 && Subtarget.hasXOP())
19842 if (Op.getOpcode() == ISD::SHL) {
19843 // Make a large shift.
19844 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
19846 SHL = DAG.getBitcast(VT, SHL);
19847 // Zero out the rightmost bits.
19848 return DAG.getNode(ISD::AND, dl, VT, SHL,
19849 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
19851 if (Op.getOpcode() == ISD::SRL) {
19852 // Make a large shift.
19853 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
19855 SRL = DAG.getBitcast(VT, SRL);
19856 // Zero out the leftmost bits.
19857 return DAG.getNode(ISD::AND, dl, VT, SRL,
19858 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
19860 if (Op.getOpcode() == ISD::SRA) {
19861 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
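        // For example, for i8 R = 0x90 (-112) and Amt = 4: lshr gives 0x09,
        // Mask = 128 >> 4 = 0x08, xor gives 0x01, and 0x01 - 0x08 = 0xF9 (-7),
        // which matches the arithmetic shift -112 >> 4 = -7.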
19862 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
19864 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
19865 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
19866 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
19869 llvm_unreachable("Unknown shift opcode.");
19874 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
19875 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
19876 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64))) {
19878 // Peek through any splat that was introduced for i64 shift vectorization.
19879 int SplatIndex = -1;
19880 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
19881 if (SVN->isSplat()) {
19882 SplatIndex = SVN->getSplatIndex();
19883 Amt = Amt.getOperand(0);
        assert(SplatIndex < (int)VT.getVectorNumElements() &&
               "Splat shuffle referencing second operand");
      }

    if (Amt.getOpcode() != ISD::BITCAST ||
        Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
      return SDValue();

    Amt = Amt.getOperand(0);
19893 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
19894 VT.getVectorNumElements();
19895 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
19896 uint64_t ShiftAmt = 0;
19897 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
19898 for (unsigned i = 0; i != Ratio; ++i) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
      if (!C) return SDValue();
      // 6 == Log2(64)
      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
    }
19906 // Check remaining shift amounts (if not a splat).
19907 if (SplatIndex < 0) {
19908 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
19909 uint64_t ShAmt = 0;
19910 for (unsigned j = 0; j != Ratio; ++j) {
          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
          if (!C) return SDValue();
          // 6 == Log2(64)
          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
        }
        if (ShAmt != ShiftAmt)
          return SDValue();
      }
    }
    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
19923 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
    if (Op.getOpcode() == ISD::SRA)
      return ArithmeticShiftRight64(ShiftAmt);
  }

  return SDValue();
}

static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
19933 const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
19937 SDValue Amt = Op.getOperand(1);
19939 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
19940 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
19942 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
19943 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
    SDValue BaseShAmt;
    MVT EltVT = VT.getVectorElementType();

    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
19950 // Check if this build_vector node is doing a splat.
19951 // If so, then set BaseShAmt equal to the splat value.
19952 BaseShAmt = BV->getSplatValue();
19953 if (BaseShAmt && BaseShAmt.isUndef())
        BaseShAmt = SDValue();
    } else {
      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
19957 Amt = Amt.getOperand(0);
19959 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
19960 if (SVN && SVN->isSplat()) {
19961 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
19962 SDValue InVec = Amt.getOperand(0);
19963 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
19964 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
19965 "Unexpected shuffle index found!");
19966 BaseShAmt = InVec.getOperand(SplatIdx);
19967 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
19968 if (ConstantSDNode *C =
19969 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
19970 if (C->getZExtValue() == SplatIdx)
              BaseShAmt = InVec.getOperand(1);
          }
        }
        if (!BaseShAmt)
          // Avoid introducing an extract element from a shuffle.
          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
                                  DAG.getIntPtrConstant(SplatIdx, dl));
      }
    }
    if (BaseShAmt.getNode()) {
19983 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
19984 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
19985 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
19986 else if (EltVT.bitsLT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
    }
  }

  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
19994 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
19995 Amt.getOpcode() == ISD::BITCAST &&
19996 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
19997 Amt = Amt.getOperand(0);
19998 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
19999 VT.getVectorNumElements();
20000 std::vector<SDValue> Vals(Ratio);
20001 for (unsigned i = 0; i != Ratio; ++i)
20002 Vals[i] = Amt.getOperand(i);
20003 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
20004 for (unsigned j = 0; j != Ratio; ++j)
        if (Vals[j] != Amt.getOperand(i + j))
          return SDValue();
    }
    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
  }

  return SDValue();
}

static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
20016 SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
20020 SDValue Amt = Op.getOperand(1);
20021 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
20023 assert(VT.isVector() && "Custom lowering only for vector shifts!");
20024 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
    return V;

  if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
    return V;

  if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
    return Op;

  // XOP has 128-bit variable logical/arithmetic shifts.
20036 // +ve/-ve Amt = shift left/right.
20037 if (Subtarget.hasXOP() &&
20038 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
20039 VT == MVT::v8i16 || VT == MVT::v16i8)) {
20040 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
20041 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
    }
    if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
20045 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
20046 if (Op.getOpcode() == ISD::SRA)
      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
  }

  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
20051 // shifts per-lane and then shuffle the partial results back together.
20052 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
20053 // Splat the shift amounts so the scalar shifts above will catch it.
20054 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
20055 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
20056 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
20057 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
  }

  // i64 vector arithmetic shift can be emulated with the transform:
20062 // M = lshr(SIGN_BIT, Amt)
20063 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
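  // e.g. with Amt == 3 in a lane, M = SIGN_BIT >> 3 = 0x1000000000000000;
  // XORing the logically shifted value with M and then subtracting M
  // sign-extends the top three bits, giving the arithmetic-shift result.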
20064 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
20065 Op.getOpcode() == ISD::SRA) {
20066 SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
20067 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
20068 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
20069 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
    return R;
  }

  // If possible, lower this packed shift into a vector multiply instead of
20075 // expanding it into a sequence of scalar shifts.
20076 // Do this only if the vector shift count is a constant build_vector.
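  // e.g. (shl v4i32:x, <1, 2, 3, 4>) becomes (mul x, <2, 4, 8, 16>), which
  // lowers to a vector multiply rather than four scalar shifts.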
20077 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
20078 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
20079 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
20080 SmallVector<SDValue, 8> Elts;
20081 MVT SVT = VT.getVectorElementType();
20082 unsigned SVTBits = SVT.getSizeInBits();
20083 APInt One(SVTBits, 1);
20084 unsigned NumElems = VT.getVectorNumElements();
20086 for (unsigned i=0; i !=NumElems; ++i) {
20087 SDValue Op = Amt->getOperand(i);
20088 if (Op->isUndef()) {
        Elts.push_back(Op);
        continue;
      }

      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
20094 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
20095 uint64_t ShAmt = C.getZExtValue();
20096 if (ShAmt >= SVTBits) {
        Elts.push_back(DAG.getUNDEF(SVT));
        continue;
      }
      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
    }
    SDValue BV = DAG.getBuildVector(VT, dl, Elts);
    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
  }

  // Lower SHL with variable shift amount.
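  // The sequence below builds 2^Amt per lane without a variable shift: the
  // amount is shifted into the f32 exponent field (shl by 23), biased by
  // adding 0x3f800000 (1.0f), reinterpreted as float and converted back to
  // integer (e.g. Amt == 3 yields 8.0f -> 8); the final MUL by R then
  // performs the shift.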
20107 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
20108 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
20110 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
20111 DAG.getConstant(0x3f800000U, dl, VT));
20112 Op = DAG.getBitcast(MVT::v4f32, Op);
20113 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }

  // If possible, lower this shift as a sequence of two shifts by
20118 // constant plus a MOVSS/MOVSD instead of scalarizing it.
20120 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
20122 // Could be rewritten as:
20123 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
20125 // The advantage is that the two shifts from the example would be
20126 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
  // the vector shift into four scalar shifts plus four pairs of vector
  // insert/extract.
  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
20130 unsigned TargetOpcode = X86ISD::MOVSS;
20131 bool CanBeSimplified;
20132 // The splat value for the first packed shift (the 'X' from the example).
20133 SDValue Amt1 = Amt->getOperand(0);
20134 // The splat value for the second packed shift (the 'Y' from the example).
20135 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
20137 // See if it is possible to replace this node with a sequence of
20138 // two shifts followed by a MOVSS/MOVSD
20139 if (VT == MVT::v4i32) {
20140 // Check if it is legal to use a MOVSS.
20141 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
20142 Amt2 == Amt->getOperand(3);
20143 if (!CanBeSimplified) {
20144 // Otherwise, check if we can still simplify this node using a MOVSD.
20145 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
20146 Amt->getOperand(2) == Amt->getOperand(3);
20147 TargetOpcode = X86ISD::MOVSD;
        Amt2 = Amt->getOperand(2);
      }
    } else {
      // Do similar checks for the case where the machine value type
      // is MVT::v8i16.
      CanBeSimplified = Amt1 == Amt->getOperand(1);
20154 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
20155 CanBeSimplified = Amt2 == Amt->getOperand(i);
20157 if (!CanBeSimplified) {
20158 TargetOpcode = X86ISD::MOVSD;
20159 CanBeSimplified = true;
20160 Amt2 = Amt->getOperand(4);
20161 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
20162 CanBeSimplified = Amt1 == Amt->getOperand(i);
20163 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
          CanBeSimplified = Amt2 == Amt->getOperand(j);
      }
    }

    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
20169 isa<ConstantSDNode>(Amt2)) {
20170 // Replace this node with two shifts followed by a MOVSS/MOVSD.
      MVT CastVT = MVT::v4i32;
      SDValue Splat1 =
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
      SDValue Splat2 =
          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
20177 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
20178 if (TargetOpcode == X86ISD::MOVSD)
20179 CastVT = MVT::v2i64;
20180 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
20181 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
      SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
                                            BitCast1, DAG);
      return DAG.getBitcast(VT, Result);
    }
  }

  // v4i32 Non Uniform Shifts.
20189 // If the shift amount is constant we can shift each lane using the SSE2
20190 // immediate shifts, else we need to zero-extend each lane to the lower i64
20191 // and shift using the SSE2 variable shifts.
20192 // The separate results can then be blended together.
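  // e.g. after the four per-lane shifts below, R0/R2 are combined keeping
  // lanes 0 and 2, R1/R3 keeping lanes 1 and 3, and a final shuffle
  // interleaves them back into <R0[0], R1[1], R2[2], R3[3]>.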
20193 if (VT == MVT::v4i32) {
20194 unsigned Opc = Op.getOpcode();
    SDValue Amt0, Amt1, Amt2, Amt3;
    if (ConstantAmt) {
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
20198 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
20199 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
    } else {
      // ISD::SHL is handled above but we include it here for completeness.
      switch (Opc) {
      default: llvm_unreachable("Unknown target vector shift node");
      case ISD::SHL: Opc = X86ISD::VSHL; break;
      case ISD::SRL: Opc = X86ISD::VSRL; break;
      case ISD::SRA: Opc = X86ISD::VSRA; break;
      }
20216 // The SSE2 shifts use the lower i64 as the same shift amount for
20217 // all lanes and the upper i64 is ignored. These shuffle masks
20218 // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
20219 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
20220 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
20221 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
20222 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
    }

    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
20227 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
20228 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
20229 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
20230 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
20231 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
    return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
  }

  if (VT == MVT::v16i8 ||
20236 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
20237 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
20238 unsigned ShiftOpcode = Op->getOpcode();
20240 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
20241 // On SSE41 targets we make use of the fact that VSELECT lowers
20242 // to PBLENDVB which selects bytes based just on the sign bit.
20243 if (Subtarget.hasSSE41()) {
20244 V0 = DAG.getBitcast(VT, V0);
20245 V1 = DAG.getBitcast(VT, V1);
20246 Sel = DAG.getBitcast(VT, Sel);
20247 return DAG.getBitcast(SelVT,
                               DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
20251 // zero - a negative value will set all bits of the lanes to true
20252 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
20253 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
20254 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
      return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
20259 // We can safely do this using i16 shifts as we're only interested in
20260 // the 3 lower bits of each byte.
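    // After the shift by 5, bit 2 of each byte's shift amount sits in that
    // byte's sign bit, so the first select below applies a shift of 4 when it
    // is set; each subsequent "a += a" moves the next lower amount bit into
    // the sign position for the shift-by-2 and shift-by-1 steps.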
20261 Amt = DAG.getBitcast(ExtVT, Amt);
20262 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
20263 Amt = DAG.getBitcast(VT, Amt);
20265 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
20266 // r = VSELECT(r, shift(r, 4), a);
      SDValue M =
          DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);
20272 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20274 // r = VSELECT(r, shift(r, 2), a);
20275 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
20276 R = SignBitSelect(VT, Amt, M, R);
20279 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20281 // return VSELECT(r, shift(r, 1), a);
20282 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);
      return R;
    }

    if (Op->getOpcode() == ISD::SRA) {
20288 // For SRA we need to unpack each byte to the higher byte of a i16 vector
      // so we can correctly sign extend. We don't care what happens to the
      // lower byte.
      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
20292 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
20293 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
20294 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
20295 ALo = DAG.getBitcast(ExtVT, ALo);
20296 AHi = DAG.getBitcast(ExtVT, AHi);
20297 RLo = DAG.getBitcast(ExtVT, RLo);
20298 RHi = DAG.getBitcast(ExtVT, RHi);
20300 // r = VSELECT(r, shift(r, 4), a);
20301 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20302 DAG.getConstant(4, dl, ExtVT));
20303 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20304 DAG.getConstant(4, dl, ExtVT));
20305 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20306 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20309 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
20310 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
20312 // r = VSELECT(r, shift(r, 2), a);
20313 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20314 DAG.getConstant(2, dl, ExtVT));
20315 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20316 DAG.getConstant(2, dl, ExtVT));
20317 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20318 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20321 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
20322 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
20324 // r = VSELECT(r, shift(r, 1), a);
20325 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20326 DAG.getConstant(1, dl, ExtVT));
20327 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20328 DAG.getConstant(1, dl, ExtVT));
20329 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20330 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
      // Logical shift the result back to the lower byte, leaving a zero upper
      // byte meaning that we can safely pack with PACKUSWB.
      RLo =
          DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
      RHi =
          DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
    }
  }

  // It's worth extending once and using the v8i32 shifts for 16-bit types, but
20344 // the extra overheads to get from v16i8 to v8i32 make the existing SSE
20345 // solution better.
20346 if (Subtarget.hasInt256() && VT == MVT::v8i16) {
    MVT ExtVT = MVT::v8i32;
    unsigned ExtOpc =
        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20350 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
20351 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
20352 return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
  }

  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
20357 MVT ExtVT = MVT::v8i32;
20358 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
20359 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
20360 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
20361 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
20362 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
20363 ALo = DAG.getBitcast(ExtVT, ALo);
20364 AHi = DAG.getBitcast(ExtVT, AHi);
20365 RLo = DAG.getBitcast(ExtVT, RLo);
20366 RHi = DAG.getBitcast(ExtVT, RHi);
20367 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
20368 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
20369 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
20370 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }

  if (VT == MVT::v8i16) {
20375 unsigned ShiftOpcode = Op->getOpcode();
20377 // If we have a constant shift amount, the non-SSE41 path is best as
20378 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
20379 bool UseSSE41 = Subtarget.hasSSE41() &&
20380 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
20382 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
20383 // On SSE41 targets we make use of the fact that VSELECT lowers
      // to PBLENDVB which selects bytes based just on the sign bit.
      if (UseSSE41) {
        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
20387 V0 = DAG.getBitcast(ExtVT, V0);
20388 V1 = DAG.getBitcast(ExtVT, V1);
20389 Sel = DAG.getBitcast(ExtVT, Sel);
20390 return DAG.getBitcast(
            VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we splat the sign bit - a negative value will
20394 // set all bits of the lanes to true and VSELECT uses that in
      // its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue C =
          DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
      return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
20403 // On SSE41 targets we need to replicate the shift mask in both
    // bytes for PBLENDVB.
    if (UseSSE41)
      Amt = DAG.getNode(ISD::OR, dl, VT,
                        DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
                        DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
    else
      Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
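    // With a 4-bit shift amount per i16 lane, the shift by 12 places bit 3 of
    // the amount in the lane's sign bit (the OR with the shift by 4 replicates
    // it into the low byte so PBLENDVB sees it in both bytes); the selects
    // below then conditionally apply shifts of 8, 4, 2 and 1 as "Amt += Amt"
    // walks down the remaining bits.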
20413 // r = VSELECT(r, shift(r, 8), a);
20414 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
20415 R = SignBitSelect(Amt, M, R);
20418 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20420 // r = VSELECT(r, shift(r, 4), a);
20421 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
20422 R = SignBitSelect(Amt, M, R);
20425 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20427 // r = VSELECT(r, shift(r, 2), a);
20428 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
20429 R = SignBitSelect(Amt, M, R);
20432 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20434 // return VSELECT(r, shift(r, 1), a);
20435 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
    R = SignBitSelect(Amt, M, R);
    return R;
  }

  // Decompose 256-bit shifts into smaller 128-bit shifts.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  return SDValue();
}

static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
20448 SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  SDValue R = Op.getOperand(0);
20452 SDValue Amt = Op.getOperand(1);
20454 assert(VT.isVector() && "Custom lowering only for vector rotates!");
20455 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
20456 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
20458 // XOP has 128-bit vector variable + immediate rotates.
20459 // +ve/-ve Amt = rotate left/right.
20461 // Split 256-bit integers.
20462 if (VT.is256BitVector())
20463 return Lower256IntArith(Op, DAG);
20465 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
20467 // Attempt to rotate by immediate.
20468 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
20469 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
20470 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
20471 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
20472 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
                         DAG.getConstant(RotateAmt, DL, MVT::i8));
    }
  }

  // Use general rotate by variable (per-element).
  return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
}

static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
20482 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
20483 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
20484 // looks for this combo and may remove the "setcc" instruction if the "setcc"
20485 // has only one use.
20486 SDNode *N = Op.getNode();
20487 SDValue LHS = N->getOperand(0);
20488 SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  X86::CondCode Cond;
  SDLoc DL(Op);
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::INC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
20510 // A subtract of one will be selected as a DEC. Note that DEC doesn't
20511 // set CF, so we can't do this for USUBO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::DEC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
20525 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    if (N->getValueType(0) == MVT::i8) {
20530 BaseOp = X86ISD::UMUL8;
      Cond = X86::COND_O;
      break;
    }
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
    SDValue SetCC =
        DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
20540 DAG.getConstant(X86::COND_O, DL, MVT::i32),
20541 SDValue(Sum.getNode(), 2));
20543 if (N->getValueType(1) == MVT::i1) {
20544 SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
20545 DAG.getValueType(MVT::i1));
      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
    }
    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
  }
  }

  // Also sets EFLAGS.
20553 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
  SDValue SetCC =
      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
20558 DAG.getConstant(Cond, DL, MVT::i32),
20559 SDValue(Sum.getNode(), 1));
20561 if (N->getValueType(1) == MVT::i1) {
20562 SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
20563 DAG.getValueType(MVT::i1));
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
  }
  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}

/// Returns true if the operand type is exactly twice the native width, and
20570 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
20571 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
20572 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
20573 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
  unsigned OpWidth = MemType->getPrimitiveSizeInBits();

  if (OpWidth == 64)
    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
  else if (OpWidth == 128)
    return Subtarget.hasCmpxchg16b();

  return false;
}

bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return needsCmpXchgNb(SI->getValueOperand()->getType());
}

// Note: this turns large loads into lock cmpxchg8b/16b.
20589 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
20590 TargetLowering::AtomicExpansionKind
20591 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
20592 auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
20593 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
20594 : AtomicExpansionKind::None;
20597 TargetLowering::AtomicExpansionKind
20598 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
20599 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
20600 Type *MemType = AI->getType();
20602 // If the operand is too big, we must see if cmpxchg8/16b is available
20603 // and default to library calls otherwise.
20604 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
20605 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                   : AtomicExpansionKind::None;
  }

  AtomicRMWInst::BinOp Op = AI->getOperation();
  switch (Op) {
  default: llvm_unreachable("Unknown atomic operation");
20613 case AtomicRMWInst::Xchg:
20614 case AtomicRMWInst::Add:
20615 case AtomicRMWInst::Sub:
20616 // It's better to use xadd, xsub or xchg for these in all cases.
20617 return AtomicExpansionKind::None;
20618 case AtomicRMWInst::Or:
20619 case AtomicRMWInst::And:
20620 case AtomicRMWInst::Xor:
20621 // If the atomicrmw's result isn't actually used, we can just add a "lock"
20622 // prefix to a normal instruction for these operations.
20623 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
20624 : AtomicExpansionKind::None;
20625 case AtomicRMWInst::Nand:
20626 case AtomicRMWInst::Max:
20627 case AtomicRMWInst::Min:
20628 case AtomicRMWInst::UMax:
20629 case AtomicRMWInst::UMin:
20630 // These always require a non-trivial set of data operations on x86. We must
20631 // use a cmpxchg loop.
    return AtomicExpansionKind::CmpXChg;
  }
}

LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
20638 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
20639 Type *MemType = AI->getType();
20640 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
20641 // there is no benefit in turning such RMWs into loads, and it is actually
20642 // harmful as it introduces a mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  auto Builder = IRBuilder<>(AI);
20647 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20648 auto SynchScope = AI->getSynchScope();
20649 // We must restrict the ordering to avoid generating loads with Release or
20650 // ReleaseAcquire orderings.
20651 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
20652 auto Ptr = AI->getPointerOperand();
20654 // Before the load we need a fence. Here is an example lifted from
20655 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //  x.store(1, relaxed);
  //  r1 = y.fetch_add(0, release);
  // Thread 1:
  //  y.fetch_add(42, acquire);
  //  r2 = x.load(relaxed);
20663 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
20664 // lowered to just a load without a fence. A mfence flushes the store buffer,
20665 // making the optimization clearly correct.
20666 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
20667 // otherwise, we might be able to be more aggressive on relaxed idempotent
20668 // rmw. In practice, they do not look useful, so we don't try to be
20669 // especially clever.
  if (SynchScope == SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
20676 // FIXME: it might make sense to use a locked operation here but on a
20677 // different cache-line to prevent cache-line bouncing. In practice it
20678 // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});
20686 // Finally we can emit the atomic load.
20687 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
20688 AI->getType()->getPrimitiveSizeInBits());
20689 Loaded->setAtomic(Order, SynchScope);
20690 AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}

static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
20699 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
20700 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
20701 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
20703 // The only fence that needs an instruction is a sequentially-consistent
20704 // cross-thread fence.
20705 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
20706 FenceScope == CrossThread) {
20707 if (Subtarget.hasMFence())
20708 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32),     // Base
20714 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
20715 DAG.getRegister(0, MVT::i32), // Index
20716 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
      DAG.getRegister(0, MVT::i32),            // Segment.
      Zero,
      Chain
    };
    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
    return SDValue(Res, 0);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}

static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
20730 SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.SimpleTy) {
20736 default: llvm_unreachable("Invalid value type!");
20737 case MVT::i8: Reg = X86::AL; size = 1; break;
20738 case MVT::i16: Reg = X86::AX; size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
20746 Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, DL, MVT::i8),
                    cpIn.getValue(1) };
20752 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20753 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, T, MMO);
  SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
20759 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
20760 MVT::i32, cpOut.getValue(2));
20761 SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
                                DAG.getConstant(X86::COND_E, DL, MVT::i8),
                                EFLAGS);

  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
20766 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
  return SDValue();
}

static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
20772 SelectionDAG &DAG) {
20773 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
20774 MVT DstVT = Op.getSimpleValueType();
20776 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
20777 SrcVT == MVT::i64) {
20778 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();

    SDValue Op0 = Op->getOperand(0);
    SmallVector<SDValue, 16> Elts;
    SDLoc dl(Op);
    unsigned NumElts;
    MVT SVT;
    if (SrcVT.isVector()) {
20789 NumElts = SrcVT.getVectorNumElements();
20790 SVT = SrcVT.getVectorElementType();
20792 // Widen the vector in input in the case of MVT::v2i32.
20793 // Example: from MVT::v2i32 to MVT::v4i32.
20794 for (unsigned i = 0, e = NumElts; i != e; ++i)
20795 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
                                   DAG.getIntPtrConstant(i, dl)));
    } else {
      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
20799 "Unexpected source type in LowerBITCAST");
20800 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
20801 DAG.getIntPtrConstant(0, dl)));
20802 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(1, dl)));
      NumElts = 2; SVT = MVT::i32;
    }
    // Explicitly mark the extra elements as Undef.
20808 Elts.append(NumElts, DAG.getUNDEF(SVT));
20810 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
20811 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
20812 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
20813 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
20814 DAG.getIntPtrConstant(0, dl));
20817 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
20818 Subtarget.hasMMX() && "Unexpected custom BITCAST");
20819 assert((DstVT == MVT::i64 ||
20820 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
20821 "Unexpected custom BITCAST");
20822 // i64 <=> MMX conversions are Legal.
  if (SrcVT==MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT==MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}

/// Compute the horizontal sum of bytes in V for the elements of VT.
20836 /// Requires V to be a byte vector and VT to be an integer vector type with
20837 /// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
20841 const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
  MVT ByteVecVT = V.getSimpleValueType();
20845 MVT EltVT = VT.getVectorElementType();
20846 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
20847 "Expected value to have byte element type.");
20848 assert(EltVT != MVT::i8 &&
20849 "Horizontal byte sum only makes sense for wider elements!");
20850 unsigned VecSize = VT.getSizeInBits();
20851 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
20853 // PSADBW instruction horizontally add all bytes and leave the result in i64
20854 // chunks, thus directly computes the pop count for v2i64 and v4i64.
20855 if (EltVT == MVT::i64) {
20856 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
20857 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
20858 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
20859 return DAG.getBitcast(VT, V);
20862 if (EltVT == MVT::i32) {
20863 // We unpack the low half and high half into i32s interleaved with zeros so
20864 // that we can use PSADBW to horizontally sum them. The most useful part of
20865 // this is that it lines up the results of two PSADBW instructions to be
20866 // two v2i64 vectors which concatenated are the 4 population counts. We can
20867 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
20868 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
20869 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
20870 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
20872 // Do the horizontal sums into two v2i64s.
20873 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
20874 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
20875 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
20876 DAG.getBitcast(ByteVecVT, Low), Zeros);
20877 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
20878 DAG.getBitcast(ByteVecVT, High), Zeros);
20880 // Merge them together.
20881 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
20882 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
20883 DAG.getBitcast(ShortVecVT, Low),
20884 DAG.getBitcast(ShortVecVT, High));
20886 return DAG.getBitcast(VT, V);
20889 // The only element type left is i16.
20890 assert(EltVT == MVT::i16 && "Unknown how to handle type");
20892 // To obtain pop count for each i16 element starting from the pop count for
20893 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
20894 // right by 8. It is important to shift as i16s as i8 vector shift isn't
20895 // directly supported.
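  // e.g. if an i16 lane holds the two byte counts <5, 3>: the shift left by 8
  // moves the low byte's count (5) into the high byte, the byte-wise add
  // leaves 5 + 3 = 8 there, and the final shift right by 8 brings the total
  // down into the lane.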
20896 SDValue ShifterV = DAG.getConstant(8, DL, VT);
20897 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
20898 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
20899 DAG.getBitcast(ByteVecVT, V));
20900 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
20903 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
20904 const X86Subtarget &Subtarget,
20905 SelectionDAG &DAG) {
20906 MVT VT = Op.getSimpleValueType();
20907 MVT EltVT = VT.getVectorElementType();
20908 unsigned VecSize = VT.getSizeInBits();
20910 // Implement a lookup table in register by using an algorithm based on:
20911 // http://wm.ite.pl/articles/sse-popcount.html
20913 // The general idea is that every lower byte nibble in the input vector is an
20914 // index into a in-register pre-computed pop count table. We then split up the
20915 // input vector in two new ones: (1) a vector with only the shifted-right
20916 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
20917 // masked out higher ones) for each byte. PSHUB is used separately with both
20918 // to index the in-register table. Next, both are added and the result is a
20919 // i8 vector where each element contains the pop count for input byte.
20921 // To obtain the pop count for elements != i8, we follow up with the same
20922 // approach and use additional tricks as described below.
20924 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
20925 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
20926 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
20927 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
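  // e.g. the byte 0xB7 is split into nibbles 0xB and 0x7; the PSHUFB lookups
  // below return LUT[0xB] = 3 and LUT[0x7] = 3, and adding them gives the
  // byte's population count of 6.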
20929 int NumByteElts = VecSize / 8;
20930 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
20931 SDValue In = DAG.getBitcast(ByteVecVT, Op);
20932 SmallVector<SDValue, 64> LUTVec;
20933 for (int i = 0; i < NumByteElts; ++i)
20934 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
20935 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
20936 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
20939 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
20940 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
20943 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
20945 // The input vector is used as the shuffle mask that index elements into the
20946 // LUT. After counting low and high nibbles, add the vector to obtain the
20947 // final pop count per i8 element.
20948 SDValue HighPopCnt =
20949 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
20950 SDValue LowPopCnt =
20951 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
20952 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
  if (EltVT == MVT::i8)
    return PopCnt;

  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
}

static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
20961 const X86Subtarget &Subtarget,
20962 SelectionDAG &DAG) {
20963 MVT VT = Op.getSimpleValueType();
20964 assert(VT.is128BitVector() &&
20965 "Only 128-bit vector bitmath lowering supported.");
20967 int VecSize = VT.getSizeInBits();
20968 MVT EltVT = VT.getVectorElementType();
20969 int Len = EltVT.getSizeInBits();
20971 // This is the vectorized version of the "best" algorithm from
20972 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
20973 // with a minor tweak to use a series of adds + shifts instead of vector
20974 // multiplications. Implemented for all integer vector types. We only use
20975 // this when we don't have SSSE3 which allows a LUT-based lowering that is
20976 // much faster, even faster than using native popcnt instructions.
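  // e.g. for the byte 0xFF: step 1 gives 0xFF - 0x55 = 0xAA (each 2-bit field
  // holds 2), step 2 gives 0x22 + 0x22 = 0x44 (each 4-bit field holds 4), and
  // step 3 gives (0x44 + 0x04) & 0x0F = 8, the byte's population count.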
20978 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
20979 MVT VT = V.getSimpleValueType();
20980 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
20981 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
20983 auto GetMask = [&](SDValue V, APInt Mask) {
20984 MVT VT = V.getSimpleValueType();
20985 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
20986 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
20989 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
20990 // x86, so set the SRL type to have elements at least i16 wide. This is
20991 // correct because all of our SRLs are followed immediately by a mask anyways
20992 // that handles any bits that sneak into the high bits of the byte elements.
  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);

  SDValue V = Op;

  // v = v - ((v >> 1) & 0x55555555...)
  SDValue Srl =
      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
21000 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
21001 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
21003 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
21004 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
21005 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
21006 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
21007 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
21009 // v = (v + (v >> 4)) & 0x0F0F0F0F...
21010 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
21011 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
21012 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
21014 // At this point, V contains the byte-wise population count, and we are
  // merely doing a horizontal sum if necessary to get the wider element
  // counts.
  if (EltVT == MVT::i8)
    return V;

  return LowerHorizontalByteSum(
      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
      DAG);
}

static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
21026 SelectionDAG &DAG) {
21027 MVT VT = Op.getSimpleValueType();
21028 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
21029 "Unknown CTPOP type to handle");
21030 SDLoc DL(Op.getNode());
21031 SDValue Op0 = Op.getOperand(0);
21033 if (!Subtarget.hasSSSE3()) {
21034 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
21035 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
21036 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
21039 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21040 unsigned NumElems = VT.getVectorNumElements();
21042 // Extract each 128-bit vector, compute pop count and concat the result.
21043 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
21044 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
21046 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21047 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
21048 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
21051 if (VT.is512BitVector() && !Subtarget.hasBWI()) {
21052 unsigned NumElems = VT.getVectorNumElements();
21054 // Extract each 256-bit vector, compute pop count and concat the result.
21055 SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
21056 SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
21058 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21059 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
21060 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
21063 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
21066 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
21067 SelectionDAG &DAG) {
21068 assert(Op.getSimpleValueType().isVector() &&
21069 "We only do custom lowering for vector population count.");
21070 return LowerVectorCTPOP(Op, Subtarget, DAG);
21073 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
21074 MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
21079 // perform the BITREVERSE.
21080 if (!VT.isVector()) {
21081 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
21082 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
21083 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
21084 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
21085 DAG.getIntPtrConstant(0, DL));
21088 MVT SVT = VT.getVectorElementType();
21089 int NumElts = VT.getVectorNumElements();
21090 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
21092 // Decompose 256-bit ops into smaller 128-bit ops.
21093 if (VT.is256BitVector()) {
21094 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
21095 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
21097 MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
21098 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21099 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
21100 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
21103 assert(VT.is128BitVector() &&
21104 "Only 128-bit vector bitreverse lowering supported.");
21106 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
21107 // perform the BSWAP in the shuffle.
21108 // Its best to shuffle using the second operand as this will implicitly allow
21109 // memory folding for multiple vectors.
21110 SmallVector<SDValue, 16> MaskElts;
21111 for (int i = 0; i != NumElts; ++i) {
21112 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
21113 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
21114 int PermuteByte = SourceByte | (2 << 5);
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
    }
  }

  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
21120 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
                    Res, Mask);
  return DAG.getBitcast(VT, Res);
}

static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
21127 SelectionDAG &DAG) {
21128 if (Subtarget.hasXOP())
21129 return LowerBITREVERSE_XOP(Op, DAG);
21131 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
21133 MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  unsigned NumElts = VT.getVectorNumElements();
21138 assert(VT.getScalarType() == MVT::i8 &&
21139 "Only byte vector BITREVERSE supported");
21141 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
21142 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21143 MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
21144 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
21145 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
21146 Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
21147 Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
21148 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21151 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
21152 // two nibbles and a PSHUFB lookup to find the bitreverse of each
21153 // 0-15 value (moved to the other nibble).
21154 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
21155 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
21156 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
21158 const int LoLUT[16] = {
21159 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
21160 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
21161 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
21162 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
21163 const int HiLUT[16] = {
21164 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
21165 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
21166 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
21167 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
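  // e.g. for the byte 0xD4 (0b11010100): LoLUT[0x4] = 0x20, HiLUT[0xD] = 0x0B,
  // and ORing the two lookups gives 0x2B (0b00101011), the bit-reversed byte.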
21169 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
21170 for (unsigned i = 0; i < NumElts; ++i) {
21171 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
21172 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
21175 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
21176 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
21177 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
21178 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
21179 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
21182 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
21183 unsigned NewOpc = 0;
21184 switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD_ADD:
    NewOpc = X86ISD::LADD;
    break;
  case ISD::ATOMIC_LOAD_SUB:
    NewOpc = X86ISD::LSUB;
    break;
  case ISD::ATOMIC_LOAD_OR:
    NewOpc = X86ISD::LOR;
    break;
  case ISD::ATOMIC_LOAD_XOR:
    NewOpc = X86ISD::LXOR;
    break;
  case ISD::ATOMIC_LOAD_AND:
    NewOpc = X86ISD::LAND;
    break;
  default:
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
  }

  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
21205 return DAG.getMemIntrinsicNode(
21206 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
21207 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
21208 /*MemVT=*/N->getSimpleValueType(0), MMO);
21211 /// Lower atomic_load_ops into LOCK-prefixed operations.
21212 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
21213 const X86Subtarget &Subtarget) {
21214 SDValue Chain = N->getOperand(0);
21215 SDValue LHS = N->getOperand(1);
21216 SDValue RHS = N->getOperand(2);
21217 unsigned Opc = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
21222 // can only be lowered when the result is unused. They should have already
21223 // been transformed into a cmpxchg loop in AtomicExpand.
21224 if (N->hasAnyUseOfValue(0)) {
21225 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
21226 // select LXADD if LOCK_SUB can't be selected.
21227 if (Opc == ISD::ATOMIC_LOAD_SUB) {
21228 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
21229 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
21230 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
21231 RHS, AN->getMemOperand(), AN->getOrdering(),
                           AN->getSynchScope());
    }
    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
           "Used AtomicRMW ops other than Add should have been expanded!");
    return N;
  }

  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
21240 // RAUW the chain, but don't worry about the result, as it's unused.
21241 assert(!N->hasAnyUseOfValue(0));
  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
  return SDValue();
}

static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
21251 // Convert seq_cst store -> xchg
21252 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
21253 // FIXME: On 32-bit, store -> fist or movq would be more efficient
21254 // (The only way to get a 16-byte store is cmpxchg16b)
21255 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
21256 if (cast<AtomicSDNode>(Node)->getOrdering() ==
21257 AtomicOrdering::SequentiallyConsistent ||
21258 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
21259 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
21260 cast<AtomicSDNode>(Node)->getMemoryVT(),
21261 Node->getOperand(0),
21262 Node->getOperand(1), Node->getOperand(2),
21263 cast<AtomicSDNode>(Node)->getMemOperand(),
21264 cast<AtomicSDNode>(Node)->getOrdering(),
21265 cast<AtomicSDNode>(Node)->getSynchScope());
    return Swap.getValue(1);
  }
  // Other atomic stores have a simple pattern.
  return Op;
}

static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
21273 MVT VT = Op.getNode()->getSimpleValueType(0);
21275 // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  unsigned Opc;
  bool ExtraOp = false;
21283 switch (Op.getOpcode()) {
21284 default: llvm_unreachable("Invalid code");
21285 case ISD::ADDC: Opc = X86ISD::ADD; break;
21286 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
21287 case ISD::SUBC: Opc = X86ISD::SUB; break;
  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
  }

  if (!ExtraOp)
    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                       Op.getOperand(1));
  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                     Op.getOperand(1), Op.getOperand(2));
}

static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
21299 SelectionDAG &DAG) {
21300 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
21302 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
21303 // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
21307 EVT ArgVT = Arg.getValueType();
21308 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21310 TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.isSExt = false;
21316 Entry.isZExt = false;
21317 Args.push_back(Entry);
21319 bool isF64 = ArgVT == MVT::f64;
21320 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
21321 // the small struct {f32, f32} is returned in (eax, edx). For f64,
21322 // the results are returned via SRet in memory.
21323 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
21328 Type *RetTy = isF64
21329 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
21330 : (Type*)VectorType::get(ArgTy, 4);
21332 TargetLowering::CallLoweringInfo CLI(DAG);
21333 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
21334 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;
21342 // Returned in bits 0:31 and 32:64 xmm0.
21343 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
21344 CallResult.first, DAG.getIntPtrConstant(0, dl));
21345 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
21346 CallResult.first, DAG.getIntPtrConstant(1, dl));
21347 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
21348 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
21351 /// Widen a vector input to a vector of NVT. The
21352 /// input vector must have the same element type as NVT.
21353 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
21354 bool FillWithZeroes = false) {
21355 // Check if InOp already has the right width.
  MVT InVT = InOp.getSimpleValueType();
  if (InVT == NVT)
    return InOp;

  if (InOp.isUndef())
21361 return DAG.getUNDEF(NVT);
21363 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
21364 "input and widen element type must match");
21366 unsigned InNumElts = InVT.getVectorNumElements();
21367 unsigned WidenNumElts = NVT.getVectorNumElements();
21368 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
21369 "Unexpected request for vector widening");
21371 EVT EltVT = NVT.getVectorElementType();
21374 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
21375 InOp.getNumOperands() == 2) {
21376 SDValue N1 = InOp.getOperand(1);
21377 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
21378 N1.isUndef()) {
21379 InOp = InOp.getOperand(0);
21380 InVT = InOp.getSimpleValueType();
21381 InNumElts = InVT.getVectorNumElements();
21384 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
21385 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
21386 SmallVector<SDValue, 16> Ops;
21387 for (unsigned i = 0; i < InNumElts; ++i)
21388 Ops.push_back(InOp.getOperand(i));
21390 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
21391 DAG.getUNDEF(EltVT);
21392 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
21393 Ops.push_back(FillVal);
21394 return DAG.getBuildVector(NVT, dl, Ops);
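// Otherwise widen by inserting InOp at index 0 of a zero (or undef) vector of
// the wider type.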
21396 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
21397 DAG.getUNDEF(NVT);
21398 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
21399 InOp, DAG.getIntPtrConstant(0, dl));
21402 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
21403 SelectionDAG &DAG) {
21404 assert(Subtarget.hasAVX512() &&
21405 "MGATHER/MSCATTER are supported on AVX-512 arch only");
21407 // The X86 scatter instruction kills the mask register, so its type should
21408 // be added to the list of return values.
21409 // If the "scatter" already has 2 return values, it has been handled.
21410 if (Op.getNode()->getNumValues() == 2)
21413 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
21414 SDValue Src = N->getValue();
21415 MVT VT = Src.getSimpleValueType();
21416 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
21419 SDValue NewScatter;
21420 SDValue Index = N->getIndex();
21421 SDValue Mask = N->getMask();
21422 SDValue Chain = N->getChain();
21423 SDValue BasePtr = N->getBasePtr();
21424 MVT MemVT = N->getMemoryVT().getSimpleVT();
21425 MVT IndexVT = Index.getSimpleValueType();
21426 MVT MaskVT = Mask.getSimpleValueType();
21428 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
21429 // The v2i32 value was promoted to v2i64.
21430 // Now we "redo" the type legalizer's work and widen the original
21431 // v2i32 value to v4i32. The original v2i32 elements are recovered from
21432 // the promoted v2i64 by the shuffle below.
21433 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
21434 "Unexpected memory type");
21435 int ShuffleMask[] = {0, 2, -1, -1};
21436 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
21437 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
21438 // Now we have 4 elements instead of 2.
21439 // Expand the index.
21440 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
21441 Index = ExtendToType(Index, NewIndexVT, DAG);
21443 // Expand the mask with zeroes
21444 // Mask may be <2 x i64> or <2 x i1> at this moment
21445 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
21446 "Unexpected mask type");
21447 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
21448 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
21452 unsigned NumElts = VT.getVectorNumElements();
21453 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
21454 !Index.getSimpleValueType().is512BitVector()) {
21455 // AVX512F supports only 512-bit vectors. Either the data or the index
21456 // must be 512 bits wide. If both the index and the data are 256-bit but
21457 // the vector contains 8 elements, just sign-extend the index.
21458 if (IndexVT == MVT::v8i32)
21459 // Just extend index
21460 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21462 // The minimum number of elements in a scatter is 8.
21465 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
21466 // Use original index here, do not modify the index twice
21467 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
21468 if (IndexVT.getScalarType() == MVT::i32)
21469 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21472 // At this point we have a promoted mask operand.
21473 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
21474 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
21475 // Use the original mask here, do not modify the mask twice
21476 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
21478 // The value that should be stored
21479 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
21480 Src = ExtendToType(Src, NewVT, DAG);
21483 // If the mask is "wide" at this point, truncate it to an i1 vector.
21484 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
21485 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
21487 // The mask is killed by scatter, add it to the values
21488 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
21489 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
21490 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
21491 N->getMemOperand());
21492 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
21493 return SDValue(NewScatter.getNode(), 1);
21496 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
21497 SelectionDAG &DAG) {
21499 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
21500 MVT VT = Op.getSimpleValueType();
21501 MVT ScalarVT = VT.getScalarType();
21502 SDValue Mask = N->getMask();
21505 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
21506 "Cannot lower masked load op.");
21508 assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
21509 (Subtarget.hasBWI() &&
21510 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
21511 "Unsupported masked load op.");
21513 // This operation is legal for targets with VLX, but without
21514 // VLX the vector must be widened to 512 bits.
21515 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
21516 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
21517 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
21518 SDValue Src0 = N->getSrc0();
21519 Src0 = ExtendToType(Src0, WideDataVT, DAG);
21520 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
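// Padding the mask with zeroes keeps the widened lanes inactive; the
// pass-through value is widened to match the wide load type.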
21521 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
21522 N->getBasePtr(), Mask, Src0,
21523 N->getMemoryVT(), N->getMemOperand(),
21524 N->getExtensionType());
21526 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
21527 NewLoad.getValue(0),
21528 DAG.getIntPtrConstant(0, dl));
21529 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
21530 return DAG.getMergeValues(RetOps, dl);
21533 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
21534 SelectionDAG &DAG) {
21535 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
21536 SDValue DataToStore = N->getValue();
21537 MVT VT = DataToStore.getSimpleValueType();
21538 MVT ScalarVT = VT.getScalarType();
21539 SDValue Mask = N->getMask();
21542 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
21543 "Cannot lower masked store op.");
21545 assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
21546 (Subtarget.hasBWI() &&
21547 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
21548 "Unsupported masked store op.");
21550 // This operation is legal for targets with VLX, but without
21551 // VLX the vector must be widened to 512 bits.
21552 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
21553 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
21554 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
21555 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
21556 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
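// Zero-padding the mask guarantees the extra lanes are never stored.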
21557 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
21558 Mask, N->getMemoryVT(), N->getMemOperand(),
21559 N->isTruncatingStore());
21562 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
21563 SelectionDAG &DAG) {
21564 assert(Subtarget.hasAVX512() &&
21565 "MGATHER/MSCATTER are supported on AVX-512 arch only");
21567 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
21569 MVT VT = Op.getSimpleValueType();
21570 SDValue Index = N->getIndex();
21571 SDValue Mask = N->getMask();
21572 SDValue Src0 = N->getValue();
21573 MVT IndexVT = Index.getSimpleValueType();
21574 MVT MaskVT = Mask.getSimpleValueType();
21576 unsigned NumElts = VT.getVectorNumElements();
21577 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
21579 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
21580 !Index.getSimpleValueType().is512BitVector()) {
21581 // AVX512F supports only 512-bit vectors. Either the data or the index
21582 // must be 512 bits wide. If both the index and the data are 256-bit but
21583 // the vector contains 8 elements, just sign-extend the index.
21584 if (NumElts == 8) {
21585 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21586 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
21587 N->getOperand(3), Index };
21588 DAG.UpdateNodeOperands(N, Ops);
21589 return Op;
21590 }
21592 // The minimum number of elements in a gather is 8.
21593 NumElts = 8;
21595 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
21596 Index = ExtendToType(Index, NewIndexVT, DAG);
21597 if (IndexVT.getScalarType() == MVT::i32)
21598 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21601 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
21602 // At this point we have a promoted mask operand.
21603 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
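// Widen the mask with zeroes to NumElts lanes, then truncate it to a vXi1
// mask so the added lanes never gather.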
21604 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
21605 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
21606 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
21608 // The pass-thru value
21609 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
21610 Src0 = ExtendToType(Src0, NewVT, DAG);
21612 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
21613 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
21614 N->getMemoryVT(), dl, Ops,
21615 N->getMemOperand());
21616 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
21617 NewGather.getValue(0),
21618 DAG.getIntPtrConstant(0, dl));
21619 SDValue RetOps[] = {Extract, NewGather.getValue(1)};
21620 return DAG.getMergeValues(RetOps, dl);
21625 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
21626 SelectionDAG &DAG) const {
21627 // TODO: Eventually, the lowering of these nodes should be informed by or
21628 // deferred to the GC strategy for the function in which they appear. For
21629 // now, however, they must be lowered to something. Since they are logically
21630 // no-ops in the case of a null GC strategy (or a GC strategy which does not
21631 // require special handling for these nodes), lower them as literal NOOPs for
21632 // the time being.
21633 SmallVector<SDValue, 2> Ops;
21635 Ops.push_back(Op.getOperand(0));
21636 if (Op->getGluedNode())
21637 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
21640 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
21641 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
21646 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
21647 SelectionDAG &DAG) const {
21648 // TODO: Eventually, the lowering of these nodes should be informed by or
21649 // deferred to the GC strategy for the function in which they appear. For
21650 // now, however, they must be lowered to something. Since they are logically
21651 // no-ops in the case of a null GC strategy (or a GC strategy which does not
21652 // require special handling for these nodes), lower them as literal NOOPs for
21653 // the time being.
21654 SmallVector<SDValue, 2> Ops;
21656 Ops.push_back(Op.getOperand(0));
21657 if (Op->getGluedNode())
21658 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
21661 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
21662 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
21667 /// Provide custom lowering hooks for some operations.
21668 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
21669 switch (Op.getOpcode()) {
21670 default: llvm_unreachable("Should not custom lower this!");
21671 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
21672 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
21673 return LowerCMP_SWAP(Op, Subtarget, DAG);
21674 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
21675 case ISD::ATOMIC_LOAD_ADD:
21676 case ISD::ATOMIC_LOAD_SUB:
21677 case ISD::ATOMIC_LOAD_OR:
21678 case ISD::ATOMIC_LOAD_XOR:
21679 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
21680 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
21681 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
21682 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
21683 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
21684 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
21685 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
21686 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
21687 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
21688 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
21689 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
21690 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
21691 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
21692 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
21693 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
21694 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
21695 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
21696 case ISD::SHL_PARTS:
21697 case ISD::SRA_PARTS:
21698 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
21699 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
21700 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
21701 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
21702 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
21703 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
21704 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
21705 case ISD::SIGN_EXTEND_VECTOR_INREG:
21706 return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
21707 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
21708 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
21709 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
21710 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
21712 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
21713 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
21714 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
21715 case ISD::SETCC: return LowerSETCC(Op, DAG);
21716 case ISD::SETCCE: return LowerSETCCE(Op, DAG);
21717 case ISD::SELECT: return LowerSELECT(Op, DAG);
21718 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
21719 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
21720 case ISD::VASTART: return LowerVASTART(Op, DAG);
21721 case ISD::VAARG: return LowerVAARG(Op, DAG);
21722 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
21723 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
21724 case ISD::INTRINSIC_VOID:
21725 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
21726 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
21727 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
21728 case ISD::FRAME_TO_ARGS_OFFSET:
21729 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
21730 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
21731 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
21732 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
21733 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
21734 case ISD::EH_SJLJ_SETUP_DISPATCH:
21735 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
21736 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
21737 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
21738 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
21740 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
21742 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
21743 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
21745 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
21746 case ISD::UMUL_LOHI:
21747 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
21748 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
21751 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
21757 case ISD::UMULO: return LowerXALUO(Op, DAG);
21758 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
21759 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
21763 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
21764 case ISD::ADD: return LowerADD(Op, DAG);
21765 case ISD::SUB: return LowerSUB(Op, DAG);
21769 case ISD::UMIN: return LowerMINMAX(Op, DAG);
21770 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
21771 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
21772 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
21773 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
21774 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
21775 case ISD::GC_TRANSITION_START:
21776 return LowerGC_TRANSITION_START(Op, DAG);
21777 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
21778 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
21782 /// Places new result values for the node in Results (their number
21783 /// and types must exactly match those of the original return values of
21784 /// the node), or leaves Results empty, which indicates that the node is not
21785 /// to be custom lowered after all.
21786 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
21787 SmallVectorImpl<SDValue> &Results,
21788 SelectionDAG &DAG) const {
21789 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
21791 if (!Res.getNode())
21794 assert((N->getNumValues() <= Res->getNumValues()) &&
21795 "Lowering returned the wrong number of results!");
21797 // Place the new result values based on the result number of N.
21798 // In some cases (LowerSINT_TO_FP, for example) Res has more result values
21799 // than the original node; the chain (the last value) should be dropped.
21800 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
21801 Results.push_back(Res.getValue(I));
21804 /// Replace a node with an illegal result type with a new node built out of
21805 /// custom code.
21806 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
21807 SmallVectorImpl<SDValue>&Results,
21808 SelectionDAG &DAG) const {
21810 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21811 switch (N->getOpcode()) {
21812 default:
21813 llvm_unreachable("Do not know how to custom type legalize this operation!");
21814 case X86ISD::AVG: {
21815 // Legalize types for X86ISD::AVG by expanding vectors.
21816 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
21818 auto InVT = N->getValueType(0);
21819 auto InVTSize = InVT.getSizeInBits();
21820 const unsigned RegSize =
21821 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
21822 assert((!Subtarget.hasAVX512() || RegSize < 512) &&
21823 "512-bit vector requires AVX512");
21824 assert((!Subtarget.hasAVX2() || RegSize < 256) &&
21825 "256-bit vector requires AVX2");
21827 auto ElemVT = InVT.getVectorElementType();
21828 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
21829 RegSize / ElemVT.getSizeInBits());
21830 assert(RegSize % InVT.getSizeInBits() == 0);
21831 unsigned NumConcat = RegSize / InVT.getSizeInBits();
21833 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
21834 Ops[0] = N->getOperand(0);
21835 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
21836 Ops[0] = N->getOperand(1);
21837 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
21839 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
21840 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
21841 DAG.getIntPtrConstant(0, dl)));
21844 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
21845 case X86ISD::FMINC:
21847 case X86ISD::FMAXC:
21848 case X86ISD::FMAX: {
21849 EVT VT = N->getValueType(0);
21850 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
21851 SDValue UNDEF = DAG.getUNDEF(VT);
21852 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
21853 N->getOperand(0), UNDEF);
21854 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
21855 N->getOperand(1), UNDEF);
21856 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
21859 case ISD::SIGN_EXTEND_INREG:
21864 // We don't want to expand or promote these.
21871 case ISD::UDIVREM: {
21872 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
21873 Results.push_back(V);
21876 case ISD::FP_TO_SINT:
21877 case ISD::FP_TO_UINT: {
21878 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
21880 std::pair<SDValue,SDValue> Vals =
21881 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
21882 SDValue FIST = Vals.first, StackSlot = Vals.second;
21883 if (FIST.getNode()) {
21884 EVT VT = N->getValueType(0);
21885 // Return a load from the stack slot.
21886 if (StackSlot.getNode())
21887 Results.push_back(
21888 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
21889 else
21890 Results.push_back(FIST);
21894 case ISD::UINT_TO_FP: {
21895 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
21896 if (N->getOperand(0).getValueType() != MVT::v2i32 ||
21897 N->getValueType(0) != MVT::v2f32)
21898 return;
21899 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
21900 N->getOperand(0));
21901 SDValue VBias =
21902 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
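// Standard trick for unsigned i32 -> FP conversion: OR the zero-extended
// value with the bit pattern of 2^52 so the integer lands in the mantissa of
// a double, subtract 2^52 to recover the exact value, then round to v2f32.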
21903 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
21904 DAG.getBitcast(MVT::v2i64, VBias));
21905 Or = DAG.getBitcast(MVT::v2f64, Or);
21906 // TODO: Are there any fast-math-flags to propagate here?
21907 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
21908 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
21911 case ISD::FP_ROUND: {
21912 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
21913 return;
21914 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
21915 Results.push_back(V);
21918 case ISD::FP_EXTEND: {
21919 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
21920 // No other ValueType for FP_EXTEND should reach this point.
21921 assert(N->getValueType(0) == MVT::v2f32 &&
21922 "Do not know how to legalize this Node");
21925 case ISD::INTRINSIC_W_CHAIN: {
21926 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
21928 default : llvm_unreachable("Do not know how to custom type "
21929 "legalize this intrinsic operation!");
21930 case Intrinsic::x86_rdtsc:
21931 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
21932 Results);
21933 case Intrinsic::x86_rdtscp:
21934 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
21935 Results);
21936 case Intrinsic::x86_rdpmc:
21937 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
21940 case ISD::INTRINSIC_WO_CHAIN: {
21941 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
21942 Results.push_back(V);
21945 case ISD::READCYCLECOUNTER: {
21946 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
21947 Results);
21948 }
21949 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
21950 EVT T = N->getValueType(0);
21951 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
21952 bool Regs64bit = T == MVT::i128;
21953 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
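// CMPXCHG8B/16B expects the "expected" value split into halves in EAX:EDX
// (or RAX:RDX for the 128-bit form), so extract the two halves of operand 2
// and copy them into those registers.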
21954 SDValue cpInL, cpInH;
21955 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
21956 DAG.getConstant(0, dl, HalfT));
21957 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
21958 DAG.getConstant(1, dl, HalfT));
21959 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
21960 Regs64bit ? X86::RAX : X86::EAX,
21962 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
21963 Regs64bit ? X86::RDX : X86::EDX,
21964 cpInH, cpInL.getValue(1));
21965 SDValue swapInL, swapInH;
21966 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
21967 DAG.getConstant(0, dl, HalfT));
21968 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
21969 DAG.getConstant(1, dl, HalfT));
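// The replacement value is expected in ECX:EBX (RCX:RBX). The high half is
// copied into ECX/RCX here; the low half is handled below because EBX/RBX
// may be reserved as the base pointer.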
21970 swapInH =
21971 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
21972 swapInH, cpInH.getValue(1));
21973 // If the current function needs the base pointer, RBX,
21974 // we shouldn't use cmpxchg directly.
21975 // The lowering of that instruction will clobber that register, and
21976 // since RBX will then be a reserved register, the register allocator
21977 // will not make sure its value is properly saved and restored
21978 // around this live range.
21979 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
21981 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21982 unsigned BasePtr = TRI->getBaseRegister();
21983 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
21984 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
21985 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
21986 // ISel prefers the LCMPXCHG64 variant.
21987 // If that assert breaks, it means that is no longer the case,
21988 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
21989 // not just EBX. This is a matter of accepting i64 input for that
21990 // pseudo, and restoring into the register of the right width
21991 // in the expand pseudo. Everything else should just work.
21992 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
21993 "Saving only half of the RBX");
21994 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
21995 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
21996 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
21997 Regs64bit ? X86::RBX : X86::EBX,
21998 HalfT, swapInH.getValue(1));
21999 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
22001 /*Glue*/ RBXSave.getValue(2)};
22002 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
22005 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
22006 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
22007 Regs64bit ? X86::RBX : X86::EBX, swapInL,
22008 swapInH.getValue(1));
22009 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
22010 swapInL.getValue(1)};
22011 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
22013 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
22014 Regs64bit ? X86::RAX : X86::EAX,
22015 HalfT, Result.getValue(1));
22016 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
22017 Regs64bit ? X86::RDX : X86::EDX,
22018 HalfT, cpOutL.getValue(2));
22019 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
22021 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
22022 MVT::i32, cpOutH.getValue(2));
22024 DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22025 DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS);
22026 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
22028 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
22029 Results.push_back(Success);
22030 Results.push_back(EFLAGS.getValue(1));
22033 case ISD::ATOMIC_SWAP:
22034 case ISD::ATOMIC_LOAD_ADD:
22035 case ISD::ATOMIC_LOAD_SUB:
22036 case ISD::ATOMIC_LOAD_AND:
22037 case ISD::ATOMIC_LOAD_OR:
22038 case ISD::ATOMIC_LOAD_XOR:
22039 case ISD::ATOMIC_LOAD_NAND:
22040 case ISD::ATOMIC_LOAD_MIN:
22041 case ISD::ATOMIC_LOAD_MAX:
22042 case ISD::ATOMIC_LOAD_UMIN:
22043 case ISD::ATOMIC_LOAD_UMAX:
22044 case ISD::ATOMIC_LOAD: {
22045 // Delegate to generic TypeLegalization. Situations we can really handle
22046 // should have already been dealt with by AtomicExpandPass.cpp.
22049 case ISD::BITCAST: {
22050 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22051 EVT DstVT = N->getValueType(0);
22052 EVT SrcVT = N->getOperand(0)->getValueType(0);
22054 if (SrcVT != MVT::f64 ||
22055 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
22056 return;
22058 unsigned NumElts = DstVT.getVectorNumElements();
22059 EVT SVT = DstVT.getVectorElementType();
22060 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22061 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
22062 MVT::v2f64, N->getOperand(0));
22063 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
22065 if (ExperimentalVectorWideningLegalization) {
22066 // If we are legalizing vectors by widening, we already have the desired
22067 // legal vector type, just return it.
22068 Results.push_back(ToVecInt);
22072 SmallVector<SDValue, 8> Elts;
22073 for (unsigned i = 0, e = NumElts; i != e; ++i)
22074 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
22075 ToVecInt, DAG.getIntPtrConstant(i, dl)));
22077 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
22082 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
22083 switch ((X86ISD::NodeType)Opcode) {
22084 case X86ISD::FIRST_NUMBER: break;
22085 case X86ISD::BSF: return "X86ISD::BSF";
22086 case X86ISD::BSR: return "X86ISD::BSR";
22087 case X86ISD::SHLD: return "X86ISD::SHLD";
22088 case X86ISD::SHRD: return "X86ISD::SHRD";
22089 case X86ISD::FAND: return "X86ISD::FAND";
22090 case X86ISD::FANDN: return "X86ISD::FANDN";
22091 case X86ISD::FOR: return "X86ISD::FOR";
22092 case X86ISD::FXOR: return "X86ISD::FXOR";
22093 case X86ISD::FILD: return "X86ISD::FILD";
22094 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
22095 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
22096 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
22097 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
22098 case X86ISD::FLD: return "X86ISD::FLD";
22099 case X86ISD::FST: return "X86ISD::FST";
22100 case X86ISD::CALL: return "X86ISD::CALL";
22101 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
22102 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
22103 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
22104 case X86ISD::BT: return "X86ISD::BT";
22105 case X86ISD::CMP: return "X86ISD::CMP";
22106 case X86ISD::COMI: return "X86ISD::COMI";
22107 case X86ISD::UCOMI: return "X86ISD::UCOMI";
22108 case X86ISD::CMPM: return "X86ISD::CMPM";
22109 case X86ISD::CMPMU: return "X86ISD::CMPMU";
22110 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
22111 case X86ISD::SETCC: return "X86ISD::SETCC";
22112 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
22113 case X86ISD::FSETCC: return "X86ISD::FSETCC";
22114 case X86ISD::CMOV: return "X86ISD::CMOV";
22115 case X86ISD::BRCOND: return "X86ISD::BRCOND";
22116 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
22117 case X86ISD::IRET: return "X86ISD::IRET";
22118 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
22119 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
22120 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
22121 case X86ISD::Wrapper: return "X86ISD::Wrapper";
22122 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
22123 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
22124 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
22125 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
22126 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
22127 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
22128 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
22129 case X86ISD::PINSRB: return "X86ISD::PINSRB";
22130 case X86ISD::PINSRW: return "X86ISD::PINSRW";
22131 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
22132 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
22133 case X86ISD::ANDNP: return "X86ISD::ANDNP";
22134 case X86ISD::BLENDI: return "X86ISD::BLENDI";
22135 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
22136 case X86ISD::ADDUS: return "X86ISD::ADDUS";
22137 case X86ISD::SUBUS: return "X86ISD::SUBUS";
22138 case X86ISD::HADD: return "X86ISD::HADD";
22139 case X86ISD::HSUB: return "X86ISD::HSUB";
22140 case X86ISD::FHADD: return "X86ISD::FHADD";
22141 case X86ISD::FHSUB: return "X86ISD::FHSUB";
22142 case X86ISD::ABS: return "X86ISD::ABS";
22143 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
22144 case X86ISD::FMAX: return "X86ISD::FMAX";
22145 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
22146 case X86ISD::FMIN: return "X86ISD::FMIN";
22147 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
22148 case X86ISD::FMAXC: return "X86ISD::FMAXC";
22149 case X86ISD::FMINC: return "X86ISD::FMINC";
22150 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
22151 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
22152 case X86ISD::FRCP: return "X86ISD::FRCP";
22153 case X86ISD::FRCPS: return "X86ISD::FRCPS";
22154 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
22155 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
22156 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
22157 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
22158 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
22159 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
22160 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
22161 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
22162 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
22163 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
22164 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
22165 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
22166 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
22167 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
22168 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
22169 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
22170 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
22171 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
22172 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
22173 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
22174 case X86ISD::LADD: return "X86ISD::LADD";
22175 case X86ISD::LSUB: return "X86ISD::LSUB";
22176 case X86ISD::LOR: return "X86ISD::LOR";
22177 case X86ISD::LXOR: return "X86ISD::LXOR";
22178 case X86ISD::LAND: return "X86ISD::LAND";
22179 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
22180 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
22181 case X86ISD::VZEXT: return "X86ISD::VZEXT";
22182 case X86ISD::VSEXT: return "X86ISD::VSEXT";
22183 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
22184 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
22185 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
22186 case X86ISD::VINSERT: return "X86ISD::VINSERT";
22187 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
22188 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
22189 case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD";
22190 case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD";
22191 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
22192 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
22193 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
22194 case X86ISD::VSHL: return "X86ISD::VSHL";
22195 case X86ISD::VSRL: return "X86ISD::VSRL";
22196 case X86ISD::VSRA: return "X86ISD::VSRA";
22197 case X86ISD::VSHLI: return "X86ISD::VSHLI";
22198 case X86ISD::VSRLI: return "X86ISD::VSRLI";
22199 case X86ISD::VSRAI: return "X86ISD::VSRAI";
22200 case X86ISD::VSRAV: return "X86ISD::VSRAV";
22201 case X86ISD::VROTLI: return "X86ISD::VROTLI";
22202 case X86ISD::VROTRI: return "X86ISD::VROTRI";
22203 case X86ISD::VPPERM: return "X86ISD::VPPERM";
22204 case X86ISD::CMPP: return "X86ISD::CMPP";
22205 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
22206 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
22207 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
22208 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
22209 case X86ISD::ADD: return "X86ISD::ADD";
22210 case X86ISD::SUB: return "X86ISD::SUB";
22211 case X86ISD::ADC: return "X86ISD::ADC";
22212 case X86ISD::SBB: return "X86ISD::SBB";
22213 case X86ISD::SMUL: return "X86ISD::SMUL";
22214 case X86ISD::UMUL: return "X86ISD::UMUL";
22215 case X86ISD::SMUL8: return "X86ISD::SMUL8";
22216 case X86ISD::UMUL8: return "X86ISD::UMUL8";
22217 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
22218 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
22219 case X86ISD::INC: return "X86ISD::INC";
22220 case X86ISD::DEC: return "X86ISD::DEC";
22221 case X86ISD::OR: return "X86ISD::OR";
22222 case X86ISD::XOR: return "X86ISD::XOR";
22223 case X86ISD::AND: return "X86ISD::AND";
22224 case X86ISD::BEXTR: return "X86ISD::BEXTR";
22225 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
22226 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
22227 case X86ISD::PTEST: return "X86ISD::PTEST";
22228 case X86ISD::TESTP: return "X86ISD::TESTP";
22229 case X86ISD::TESTM: return "X86ISD::TESTM";
22230 case X86ISD::TESTNM: return "X86ISD::TESTNM";
22231 case X86ISD::KORTEST: return "X86ISD::KORTEST";
22232 case X86ISD::KTEST: return "X86ISD::KTEST";
22233 case X86ISD::PACKSS: return "X86ISD::PACKSS";
22234 case X86ISD::PACKUS: return "X86ISD::PACKUS";
22235 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
22236 case X86ISD::VALIGN: return "X86ISD::VALIGN";
22237 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
22238 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
22239 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
22240 case X86ISD::SHUFP: return "X86ISD::SHUFP";
22241 case X86ISD::SHUF128: return "X86ISD::SHUF128";
22242 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
22243 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
22244 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
22245 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
22246 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
22247 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
22248 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
22249 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
22250 case X86ISD::MOVSD: return "X86ISD::MOVSD";
22251 case X86ISD::MOVSS: return "X86ISD::MOVSS";
22252 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
22253 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
22254 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
22255 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
22256 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
22257 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
22258 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
22259 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
22260 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
22261 case X86ISD::VPERMV: return "X86ISD::VPERMV";
22262 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
22263 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
22264 case X86ISD::VPERMI: return "X86ISD::VPERMI";
22265 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
22266 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
22267 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
22268 case X86ISD::VRANGE: return "X86ISD::VRANGE";
22269 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
22270 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
22271 case X86ISD::PSADBW: return "X86ISD::PSADBW";
22272 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
22273 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
22274 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
22275 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
22276 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
22277 case X86ISD::MFENCE: return "X86ISD::MFENCE";
22278 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
22279 case X86ISD::SAHF: return "X86ISD::SAHF";
22280 case X86ISD::RDRAND: return "X86ISD::RDRAND";
22281 case X86ISD::RDSEED: return "X86ISD::RDSEED";
22282 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
22283 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
22284 case X86ISD::VPROT: return "X86ISD::VPROT";
22285 case X86ISD::VPROTI: return "X86ISD::VPROTI";
22286 case X86ISD::VPSHA: return "X86ISD::VPSHA";
22287 case X86ISD::VPSHL: return "X86ISD::VPSHL";
22288 case X86ISD::VPCOM: return "X86ISD::VPCOM";
22289 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
22290 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
22291 case X86ISD::FMADD: return "X86ISD::FMADD";
22292 case X86ISD::FMSUB: return "X86ISD::FMSUB";
22293 case X86ISD::FNMADD: return "X86ISD::FNMADD";
22294 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
22295 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
22296 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
22297 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
22298 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
22299 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
22300 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
22301 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
22302 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
22303 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
22304 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
22305 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
22306 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
22307 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
22308 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
22309 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
22310 case X86ISD::XTEST: return "X86ISD::XTEST";
22311 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
22312 case X86ISD::EXPAND: return "X86ISD::EXPAND";
22313 case X86ISD::SELECT: return "X86ISD::SELECT";
22314 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
22315 case X86ISD::RCP28: return "X86ISD::RCP28";
22316 case X86ISD::EXP2: return "X86ISD::EXP2";
22317 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
22318 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
22319 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
22320 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
22321 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
22322 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
22323 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
22324 case X86ISD::SCALEF: return "X86ISD::SCALEF";
22325 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
22326 case X86ISD::ADDS: return "X86ISD::ADDS";
22327 case X86ISD::SUBS: return "X86ISD::SUBS";
22328 case X86ISD::AVG: return "X86ISD::AVG";
22329 case X86ISD::MULHRS: return "X86ISD::MULHRS";
22330 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
22331 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
22332 case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND";
22333 case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND";
22334 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
22335 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
22336 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
22337 case X86ISD::SCALAR_FP_TO_SINT_RND: return "X86ISD::SCALAR_FP_TO_SINT_RND";
22338 case X86ISD::SCALAR_FP_TO_UINT_RND: return "X86ISD::SCALAR_FP_TO_UINT_RND";
22343 /// Return true if the addressing mode represented by AM is legal for this
22344 /// target, for a load/store of the specified type.
22345 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
22346 const AddrMode &AM, Type *Ty,
22347 unsigned AS) const {
22348 // X86 supports extremely general addressing modes.
22349 CodeModel::Model M = getTargetMachine().getCodeModel();
22351 // X86 allows a sign-extended 32-bit immediate field as a displacement.
22352 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
22356 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
22358 // If a reference to this global requires an extra load, we can't fold it.
22359 if (isGlobalStubReference(GVFlags))
22362 // If BaseGV requires a register for the PIC base, we cannot also have a
22363 // BaseReg specified.
22364 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
22367 // If lower 4G is not available, then we must use rip-relative addressing.
22368 if ((M != CodeModel::Small || isPositionIndependent()) &&
22369 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
22373 switch (AM.Scale) {
22379 // These scales always work.
22384 // These scales are formed with basereg+scalereg. Only accept if there is
22385 // no basereg yet.
22389 default: // Other stuff never works.
22396 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
22397 unsigned Bits = Ty->getScalarSizeInBits();
22399 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
22400 // particularly cheaper than those without.
22404 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
22405 // variable shifts just as cheap as scalar ones.
22406 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
22409 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
22410 // fully general vector.
22414 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
22415 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
22417 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
22418 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
22419 return NumBits1 > NumBits2;
22422 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
22423 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
22426 if (!isTypeLegal(EVT::getEVT(Ty1)))
22429 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
22431 // Assuming the caller doesn't have a zeroext or signext return parameter,
22432 // truncation all the way down to i1 is valid.
22436 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
22437 return isInt<32>(Imm);
22440 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
22441 // Can also use sub to handle negated immediates.
22442 return isInt<32>(Imm);
22445 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
22446 if (!VT1.isInteger() || !VT2.isInteger())
22448 unsigned NumBits1 = VT1.getSizeInBits();
22449 unsigned NumBits2 = VT2.getSizeInBits();
22450 return NumBits1 > NumBits2;
22453 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
22454 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
22455 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
22458 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
22459 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
22460 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
22463 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
22464 EVT VT1 = Val.getValueType();
22465 if (isZExtFree(VT1, VT2))
22468 if (Val.getOpcode() != ISD::LOAD)
22471 if (!VT1.isSimple() || !VT1.isInteger() ||
22472 !VT2.isSimple() || !VT2.isInteger())
22475 switch (VT1.getSimpleVT().SimpleTy) {
22480 // X86 has 8, 16, and 32-bit zero-extending loads.
22487 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
22490 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
22491 if (!Subtarget.hasAnyFMA())
22494 VT = VT.getScalarType();
22496 if (!VT.isSimple())
22499 switch (VT.getSimpleVT().SimpleTy) {
22510 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
22511 // i16 instructions are longer (0x66 prefix) and potentially slower.
22512 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
22515 /// Targets can use this to indicate that they only support *some*
22516 /// VECTOR_SHUFFLE operations, those with specific masks.
22517 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
22518 /// are assumed to be legal.
22520 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
22522 if (!VT.isSimple())
22525 // Not for i1 vectors
22526 if (VT.getSimpleVT().getScalarType() == MVT::i1)
22529 // Very little shuffling can be done for 64-bit vectors right now.
22530 if (VT.getSimpleVT().getSizeInBits() == 64)
22533 // We only care that the types being shuffled are legal. The lowering can
22534 // handle any possible shuffle mask that results.
22535 return isTypeLegal(VT.getSimpleVT());
22539 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
22541 // Just delegate to the generic legality, clear masks aren't special.
22542 return isShuffleMaskLegal(Mask, VT);
22545 //===----------------------------------------------------------------------===//
22546 // X86 Scheduler Hooks
22547 //===----------------------------------------------------------------------===//
22549 /// Utility function to emit xbegin specifying the start of an RTM region.
22550 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
22551 const TargetInstrInfo *TII) {
22552 DebugLoc DL = MI.getDebugLoc();
22554 const BasicBlock *BB = MBB->getBasicBlock();
22555 MachineFunction::iterator I = ++MBB->getIterator();
22557 // For the v = xbegin(), we generate
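// a control-flow diamond:
//   thisMBB:  xbegin sinkMBB   (on abort, control resumes in sinkMBB with the
//                               abort status in EAX)
//   mainMBB:  eax = -1         (transaction started successfully)
//   sinkMBB:  v = eax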
22568 MachineBasicBlock *thisMBB = MBB;
22569 MachineFunction *MF = MBB->getParent();
22570 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
22571 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
22572 MF->insert(I, mainMBB);
22573 MF->insert(I, sinkMBB);
22575 // Transfer the remainder of BB and its successor edges to sinkMBB.
22576 sinkMBB->splice(sinkMBB->begin(), MBB,
22577 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
22578 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
22582 // # fallthrough to mainMBB
22583 // # on abort, control transfers to sinkMBB
22584 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
22585 thisMBB->addSuccessor(mainMBB);
22586 thisMBB->addSuccessor(sinkMBB);
22590 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
22591 mainMBB->addSuccessor(sinkMBB);
22594 // EAX is live into the sinkMBB
22595 sinkMBB->addLiveIn(X86::EAX);
22596 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
22597 MI.getOperand(0).getReg())
22600 MI.eraseFromParent();
22604 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
22605 // or XMM0_V32I8 in AVX all of this code can be replaced with that
22606 // in the .td file.
22607 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
22608 const TargetInstrInfo *TII) {
22610 switch (MI.getOpcode()) {
22611 default: llvm_unreachable("illegal opcode!");
22612 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
22613 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
22614 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
22615 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
22616 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
22617 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
22618 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
22619 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
22622 DebugLoc dl = MI.getDebugLoc();
22623 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
22625 unsigned NumArgs = MI.getNumOperands();
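// Operand 0 is the result register; copy the remaining explicit operands to
// the real (non-pseudo) instruction and skip implicit register operands,
// which the real instruction declares itself.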
22626 for (unsigned i = 1; i < NumArgs; ++i) {
22627 MachineOperand &Op = MI.getOperand(i);
22628 if (!(Op.isReg() && Op.isImplicit()))
22629 MIB.addOperand(Op);
22631 if (MI.hasOneMemOperand())
22632 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
22634 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22635 .addReg(X86::XMM0);
22637 MI.eraseFromParent();
22641 // FIXME: Custom handling because TableGen doesn't support multiple implicit
22642 // defs in an instruction pattern
22643 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
22644 const TargetInstrInfo *TII) {
22646 switch (MI.getOpcode()) {
22647 default: llvm_unreachable("illegal opcode!");
22648 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
22649 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
22650 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
22651 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
22652 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
22653 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
22654 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
22655 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
22658 DebugLoc dl = MI.getDebugLoc();
22659 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
22661 unsigned NumArgs = MI.getNumOperands(); // remove the results
22662 for (unsigned i = 1; i < NumArgs; ++i) {
22663 MachineOperand &Op = MI.getOperand(i);
22664 if (!(Op.isReg() && Op.isImplicit()))
22665 MIB.addOperand(Op);
22667 if (MI.hasOneMemOperand())
22668 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
22670 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22673 MI.eraseFromParent();
22677 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
22678 const X86Subtarget &Subtarget) {
22679 DebugLoc dl = MI.getDebugLoc();
22680 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22682 // insert input VAL into EAX
22683 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
22684 .addReg(MI.getOperand(0).getReg());
22685 // insert zero to ECX
22686 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
22688 // insert zero to EDX
22689 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
22691 // insert WRPKRU instruction
22692 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
22694 MI.eraseFromParent(); // The pseudo is gone now.
22698 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
22699 const X86Subtarget &Subtarget) {
22700 DebugLoc dl = MI.getDebugLoc();
22701 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22703 // insert zero to ECX
22704 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
22706 // insert RDPKRU instruction
22707 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
22708 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22711 MI.eraseFromParent(); // The pseudo is gone now.
22715 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
22716 const X86Subtarget &Subtarget,
22718 DebugLoc dl = MI.getDebugLoc();
22719 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22720 // Address into RAX/EAX, other two args into ECX, EDX.
22721 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
22722 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
22723 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
22724 for (int i = 0; i < X86::AddrNumOperands; ++i)
22725 MIB.addOperand(MI.getOperand(i));
22727 unsigned ValOps = X86::AddrNumOperands;
22728 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
22729 .addReg(MI.getOperand(ValOps).getReg());
22730 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
22731 .addReg(MI.getOperand(ValOps + 1).getReg());
22733 // The instruction doesn't actually take any operands though.
22734 BuildMI(*BB, MI, dl, TII->get(Opc));
22736 MI.eraseFromParent(); // The pseudo is gone now.
22740 MachineBasicBlock *
22741 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
22742 MachineBasicBlock *MBB) const {
22743 // Emit va_arg instruction on X86-64.
22745 // Operands to this pseudo-instruction:
22746 // 0 ) Output : destination address (reg)
22747 // 1-5) Input : va_list address (addr, i64mem)
22748 // 6 ) ArgSize : Size (in bytes) of vararg type
22749 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
22750 // 8 ) Align : Alignment of type
22751 // 9 ) EFLAGS (implicit-def)
22753 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
22754 static_assert(X86::AddrNumOperands == 5,
22755 "VAARG_64 assumes 5 address operands");
22757 unsigned DestReg = MI.getOperand(0).getReg();
22758 MachineOperand &Base = MI.getOperand(1);
22759 MachineOperand &Scale = MI.getOperand(2);
22760 MachineOperand &Index = MI.getOperand(3);
22761 MachineOperand &Disp = MI.getOperand(4);
22762 MachineOperand &Segment = MI.getOperand(5);
22763 unsigned ArgSize = MI.getOperand(6).getImm();
22764 unsigned ArgMode = MI.getOperand(7).getImm();
22765 unsigned Align = MI.getOperand(8).getImm();
22767 // Memory Reference
22768 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
22769 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
22770 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
22772 // Machine Information
22773 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22774 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
22775 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
22776 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
22777 DebugLoc DL = MI.getDebugLoc();
  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8
22788 unsigned TotalNumIntRegs = 6;
22789 unsigned TotalNumXMMRegs = 8;
22790 bool UseGPOffset = (ArgMode == 1);
22791 bool UseFPOffset = (ArgMode == 2);
22792 unsigned MaxOffset = TotalNumIntRegs * 8 +
22793 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
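  // In the SysV x86-64 register save area the six integer registers occupy
  // the first 6*8 = 48 bytes and the eight XMM registers the next 8*16 = 128
  // bytes, so MaxOffset is 48 when reading gp_offset and 176 when reading
  // fp_offset.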
22795 /* Align ArgSize to a multiple of 8 */
22796 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
22797 bool NeedsAlign = (Align > 8);
22799 MachineBasicBlock *thisMBB = MBB;
22800 MachineBasicBlock *overflowMBB;
22801 MachineBasicBlock *offsetMBB;
22802 MachineBasicBlock *endMBB;
22804 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
22805 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
22806 unsigned OffsetReg = 0;
22808 if (!UseGPOffset && !UseFPOffset) {
22809 // If we only pull from the overflow region, we don't create a branch.
22810 // We don't need to alter control flow.
22811 OffsetDestReg = 0; // unused
22812 OverflowDestReg = DestReg;
22814 offsetMBB = nullptr;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
22818 // First emit code to check if gp_offset (or fp_offset) is below the bound.
22819 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
22820 // If not, pull from overflow_area. (branch to overflowMBB)
    //
    //       thisMBB
    //         |     .
    //         |        .
    //     offsetMBB   overflowMBB
    //         |        .
    //         |     .
    //        endMBB
    //
22830 // Registers for the PHI in endMBB
22831 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
22832 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
22834 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
22835 MachineFunction *MF = MBB->getParent();
22836 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22837 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22838 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22840 MachineFunction::iterator MBBIter = ++MBB->getIterator();
22842 // Insert the new basic blocks
22843 MF->insert(MBBIter, offsetMBB);
22844 MF->insert(MBBIter, overflowMBB);
22845 MF->insert(MBBIter, endMBB);
22847 // Transfer the remainder of MBB and its successor edges to endMBB.
22848 endMBB->splice(endMBB->begin(), thisMBB,
22849 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
22850 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
22852 // Make offsetMBB and overflowMBB successors of thisMBB
22853 thisMBB->addSuccessor(offsetMBB);
22854 thisMBB->addSuccessor(overflowMBB);
22856 // endMBB is a successor of both offsetMBB and overflowMBB
22857 offsetMBB->addSuccessor(endMBB);
22858 overflowMBB->addSuccessor(endMBB);
22860 // Load the offset value into a register
22861 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
22867 .addOperand(Segment)
22868 .setMemRefs(MMOBegin, MMOEnd);
22870 // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
      .addReg(OffsetReg)
      .addImm(MaxOffset + 8 - ArgSizeA8);
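    // The argument fits in the register save area when
    // OffsetReg + ArgSizeA8 <= MaxOffset. Since all three values are
    // multiples of 8, this is equivalent to OffsetReg < MaxOffset + 8 -
    // ArgSizeA8, which is the unsigned comparison emitted here.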
22875 // Branch to "overflowMBB" if offset >= max
22876 // Fall through to "offsetMBB" otherwise
22877 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
      .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);
22885 // Read the reg_save_area address.
22886 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, 16)
      .addOperand(Segment)
22893 .setMemRefs(MMOBegin, MMOEnd);
22895 // Zero-extend the offset
22896 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
      .addImm(0)
      .addReg(OffsetReg)
      .addImm(X86::sub_32bit);
22902 // Add the offset to the reg_save_area to get the final address.
22903 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
22904 .addReg(OffsetReg64)
22905 .addReg(RegSaveReg);
22907 // Compute the offset for the next argument
22908 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
      .addReg(OffsetReg)
      .addImm(UseFPOffset ? 16 : 8);
22913 // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
22919 .addOperand(Segment)
22920 .addReg(NextOffsetReg)
22921 .setMemRefs(MMOBegin, MMOEnd);
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
      .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //
22932 // Load the overflow_area address into a register.
22933 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
22940 .setMemRefs(MMOBegin, MMOEnd);
22942 // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
22946 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
22947 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
22949 // aligned_addr = (addr + (align-1)) & ~(align-1)
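    // For example, with Align == 16 an address of 0x1008 becomes
    // (0x1008 + 15) & ~15 == 0x1010, the next 16-byte-aligned slot.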
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
      .addReg(OverflowAddrReg)
      .addImm(Align - 1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
      .addReg(TmpReg)
      .addImm(~(uint64_t)(Align - 1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
      .addReg(OverflowAddrReg);
  }
22962 // Compute the next overflow address after this argument.
22963 // (the overflow address should be kept 8-byte aligned)
22964 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
22965 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
22966 .addReg(OverflowDestReg)
22967 .addImm(ArgSizeA8);
22969 // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
22976 .addReg(NextAddrReg)
22977 .setMemRefs(MMOBegin, MMOEnd);
  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
22982 TII->get(X86::PHI), DestReg)
22983 .addReg(OffsetDestReg).addMBB(offsetMBB)
      .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI.eraseFromParent();

  return endMBB;
}
22993 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
22994 MachineInstr &MI, MachineBasicBlock *MBB) const {
22995 // Emit code to save XMM registers to the stack. The ABI says that the
22996 // number of registers to save is given in %al, so it's theoretically
22997 // possible to do an indirect jump trick to avoid saving all of them,
22998 // however this code takes a simpler approach and just executes all
22999 // of the stores if %al is non-zero. It's less code, and it's probably
23000 // easier on the hardware branch predictor, and stores aren't all that
23001 // expensive anyway.
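  // (On the SysV x86-64 ABI, %al carries an upper bound on the number of
  // vector registers actually used by the variadic call, which is why a
  // simple non-zero test is sufficient here.)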
23003 // Create the new basic blocks. One block contains all the XMM stores,
23004 // and one block is the final destination regardless of whether any
23005 // stores were performed.
23006 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
23007 MachineFunction *F = MBB->getParent();
23008 MachineFunction::iterator MBBIter = ++MBB->getIterator();
23009 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
23010 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
23011 F->insert(MBBIter, XMMSaveMBB);
23012 F->insert(MBBIter, EndMBB);
23014 // Transfer the remainder of MBB and its successor edges to EndMBB.
23015 EndMBB->splice(EndMBB->begin(), MBB,
23016 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23017 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
23019 // The original block will now fall through to the XMM save block.
23020 MBB->addSuccessor(XMMSaveMBB);
23021 // The XMMSaveMBB will fall through to the end block.
23022 XMMSaveMBB->addSuccessor(EndMBB);
23024 // Now add the instructions.
23025 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23026 DebugLoc DL = MI.getDebugLoc();
23028 unsigned CountReg = MI.getOperand(0).getReg();
23029 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
23030 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
23032 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
23033 // If %al is 0, branch around the XMM save block.
23034 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
23035 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
23040 // that was just emitted, but clearly shouldn't be "saved".
23041 assert((MI.getNumOperands() <= 3 ||
23042 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
23043 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
23044 "Expected last argument to be EFLAGS");
23045 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
23046 // In the XMM save block, save all the XMM argument registers.
23047 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
23048 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
23049 MachineMemOperand *MMO = F->getMachineMemOperand(
23050 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
23051 MachineMemOperand::MOStore,
23052 /*Size=*/16, /*Align=*/16);
23053 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
23054 .addFrameIndex(RegSaveFrameIndex)
23055 .addImm(/*Scale=*/1)
23056 .addReg(/*IndexReg=*/0)
23057 .addImm(/*Disp=*/Offset)
23058 .addReg(/*Segment=*/0)
23059 .addReg(MI.getOperand(i).getReg())
      .addMemOperand(MMO);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.

  return EndMBB;
}
23068 // The EFLAGS operand of SelectItr might be missing a kill marker
23069 // because there were multiple uses of EFLAGS, and ISel didn't know
23070 // which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
23073 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
23074 MachineBasicBlock* BB,
23075 const TargetRegisterInfo* TRI) {
23076 // Scan forward through BB for a use/def of EFLAGS.
23077 MachineBasicBlock::iterator miI(std::next(SelectItr));
23078 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
23079 const MachineInstr& mi = *miI;
    if (mi.readsRegister(X86::EFLAGS))
      return false;
    if (mi.definesRegister(X86::EFLAGS))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether EFLAGS is live into a
  // successor.
23088 if (miI == BB->end()) {
23089 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
23090 sEnd = BB->succ_end();
23091 sItr != sEnd; ++sItr) {
23092 MachineBasicBlock* succ = *sItr;
      if (succ->isLiveIn(X86::EFLAGS))
        return false;
    }
  }

  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
  // out. SelectMI should have a kill flag on EFLAGS.
  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
  return true;
}
23104 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
23105 // together with other CMOV pseudo-opcodes into a single basic-block with
23106 // conditional jump around it.
23107 static bool isCMOVPseudo(MachineInstr &MI) {
23108 switch (MI.getOpcode()) {
23109 case X86::CMOV_FR32:
23110 case X86::CMOV_FR64:
23111 case X86::CMOV_GR8:
23112 case X86::CMOV_GR16:
23113 case X86::CMOV_GR32:
23114 case X86::CMOV_RFP32:
23115 case X86::CMOV_RFP64:
23116 case X86::CMOV_RFP80:
23117 case X86::CMOV_V2F64:
23118 case X86::CMOV_V2I64:
23119 case X86::CMOV_V4F32:
23120 case X86::CMOV_V4F64:
23121 case X86::CMOV_V4I64:
23122 case X86::CMOV_V16F32:
23123 case X86::CMOV_V8F32:
23124 case X86::CMOV_V8F64:
23125 case X86::CMOV_V8I64:
23126 case X86::CMOV_V8I1:
23127 case X86::CMOV_V16I1:
23128 case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return true;

  default:
    return false;
  }
}
23137 MachineBasicBlock *
23138 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
23139 MachineBasicBlock *BB) const {
23140 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23141 DebugLoc DL = MI.getDebugLoc();
23143 // To "insert" a SELECT_CC instruction, we actually have to insert the
23144 // diamond control-flow pattern. The incoming instruction knows the
23145 // destination vreg to set, the condition code register to branch on, the
23146 // true/false values to select between, and a branch opcode to use.
23147 const BasicBlock *LLVM_BB = BB->getBasicBlock();
23148 MachineFunction::iterator It = ++BB->getIterator();
  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
23156 MachineBasicBlock *thisMBB = BB;
23157 MachineFunction *F = BB->getParent();
23159 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
23160 // as described above, by inserting a BB, and then making a PHI at the join
23161 // point to select the true and false operands of the CMOV in the PHI.
  // The code also handles two different cases of multiple CMOV opcodes
  // in a row.
  //
  // Case 1:
  // In this case, there are multiple CMOVs in a row, all which are based on
23168 // the same condition setting (or the exact opposite condition setting).
23169 // In this case we can lower all the CMOVs using a single inserted BB, and
23170 // then make a number of PHIs at the join point to model the CMOVs. The only
23171 // trickiness here, is that in a case like:
23173 // t2 = CMOV cond1 t1, f1
23174 // t3 = CMOV cond1 t2, f2
23176 // when rewriting this into PHIs, we have to perform some renaming on the
23177 // temps since you cannot have a PHI operand refer to a PHI result earlier
23178 // in the same block. The "simple" but wrong lowering would be:
23180 // t2 = PHI t1(BB1), f1(BB2)
23181 // t3 = PHI t2(BB1), f2(BB2)
23183 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
23184 // renaming is to note that on the path through BB1, t2 is really just a
23185 // copy of t1, and do that renaming, properly generating:
23187 // t2 = PHI t1(BB1), f1(BB2)
23188 // t3 = PHI t1(BB1), f2(BB2)
23190 // Case 2, we lower cascaded CMOVs such as
23192 // (CMOV (CMOV F, T, cc1), T, cc2)
23194 // to two successives branches. For that, we look for another CMOV as the
23195 // following instruction.
23197 // Without this, we would add a PHI between the two jumps, which ends up
23198 // creating a few copies all around. For instance, for
23200 // (sitofp (zext (fcmp une)))
23202 // we would generate:
  //   ucomiss %xmm1, %xmm0
  //   movss  <1.0f>, %xmm0
  //   movaps  %xmm0, %xmm1
  //   jne .LBB5_2
  //   xorps   %xmm1, %xmm1
  // .LBB5_2:
  //   jp .LBB5_4
  //   movaps  %xmm1, %xmm0
  // .LBB5_4:
  //   cvtsi2ss %rdi, %xmm0
  //
  // because this custom-inserter would have generated:
  //
  //   A
  //   | \
  //   |  B
  //   | /
  //   C
  //   | \
  //   |  D
  //   | /
  //   E
  //
  // A: X = ...; Y = ...
  // B: empty
  // C: Z = PHI [X, A], [Y, B]
  // D: empty
  // E: PHI [X, C], [Z, D]
  //
  // If we lower both CMOVs in a single step, we can instead generate:
  //
  //   A
  //   | \
  //   |  C
  //   | /|
  //   |/ |
  //   |  |
  //   |  D
  //   | /
  //   E
  //
  // A: X = ...; Y = ...
  // D: empty
  // E: PHI [X, A], [X, C], [Y, D]
  //
  // Which, in our sitofp/fcmp example, gives us something like:
  //
  //   ucomiss %xmm1, %xmm0
  //   movss  <1.0f>, %xmm0
  //   jne .LBB5_4
  //   xorps   %xmm0, %xmm0
  // .LBB5_4:
  //   cvtsi2ss %rdi, %xmm0
  //
23259 MachineInstr *CascadedCMOV = nullptr;
23260 MachineInstr *LastCMOV = &MI;
23261 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
23262 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
23263 MachineBasicBlock::iterator NextMIIt =
23264 std::next(MachineBasicBlock::iterator(MI));
23266 // Check for case 1, where there are multiple CMOVs with the same condition
23267 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
23268 // number of jumps the most.
23270 if (isCMOVPseudo(MI)) {
23271 // See if we have a string of CMOVS with the same condition.
23272 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
23273 (NextMIIt->getOperand(3).getImm() == CC ||
23274 NextMIIt->getOperand(3).getImm() == OppCC)) {
      LastCMOV = &*NextMIIt;
      ++NextMIIt;
    }
  }

  // This checks for case 2, but only do this if we didn't already find
23281 // case 1, as indicated by LastCMOV == MI.
23282 if (LastCMOV == &MI && NextMIIt != BB->end() &&
23283 NextMIIt->getOpcode() == MI.getOpcode() &&
23284 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
23285 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
23286 NextMIIt->getOperand(1).isKill()) {
    CascadedCMOV = &*NextMIIt;
  }

  MachineBasicBlock *jcc1MBB = nullptr;
23292 // If we have a cascaded CMOV, we lower it to two successive branches to
23293 // the same block. EFLAGS is used by both, so mark it as live in the second.
23294 if (CascadedCMOV) {
23295 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
23296 F->insert(It, jcc1MBB);
    jcc1MBB->addLiveIn(X86::EFLAGS);
  }
23300 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
23301 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
23302 F->insert(It, copy0MBB);
23303 F->insert(It, sinkMBB);
23305 // If the EFLAGS register isn't dead in the terminator, then claim that it's
23306 // live into the sink and copy blocks.
23307 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
23309 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
23310 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
23311 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
23312 copy0MBB->addLiveIn(X86::EFLAGS);
    sinkMBB->addLiveIn(X86::EFLAGS);
  }
23316 // Transfer the remainder of BB and its successor edges to sinkMBB.
23317 sinkMBB->splice(sinkMBB->begin(), BB,
23318 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
23319 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
23321 // Add the true and fallthrough blocks as its successors.
23322 if (CascadedCMOV) {
23323 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
23324 BB->addSuccessor(jcc1MBB);
23326 // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
23327 // jump to the sinkMBB.
23328 jcc1MBB->addSuccessor(copy0MBB);
    jcc1MBB->addSuccessor(sinkMBB);
  } else {
    BB->addSuccessor(copy0MBB);
  }
23334 // The true block target of the first (or only) branch is always sinkMBB.
23335 BB->addSuccessor(sinkMBB);
23337 // Create the conditional branch instruction.
23338 unsigned Opc = X86::GetCondBranchFromCond(CC);
23339 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
23341 if (CascadedCMOV) {
23342 unsigned Opc2 = X86::GetCondBranchFromCond(
23343 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
    BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
  }

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  copy0MBB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
23355 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
23356 MachineBasicBlock::iterator MIItEnd =
23357 std::next(MachineBasicBlock::iterator(LastCMOV));
23358 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
23359 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
23360 MachineInstrBuilder MIB;
23362 // As we are creating the PHIs, we have to be careful if there is more than
23363 // one. Later CMOVs may reference the results of earlier CMOVs, but later
23364 // PHIs have to reference the individual true/false inputs from earlier PHIs.
23365 // That also means that PHI construction must work forward from earlier to
23366 // later, and that the code must maintain a mapping from earlier PHI's
23367 // destination registers, and the registers that went into the PHI.
23369 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
23370 unsigned DestReg = MIIt->getOperand(0).getReg();
23371 unsigned Op1Reg = MIIt->getOperand(1).getReg();
23372 unsigned Op2Reg = MIIt->getOperand(2).getReg();
23374 // If this CMOV we are generating is the opposite condition from
23375 // the jump we generated, then we have to swap the operands for the
23376 // PHI that is going to be generated.
23377 if (MIIt->getOperand(3).getImm() == OppCC)
23378 std::swap(Op1Reg, Op2Reg);
23380 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
23381 Op1Reg = RegRewriteTable[Op1Reg].first;
23383 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
23384 Op2Reg = RegRewriteTable[Op2Reg].second;
23386 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
23387 TII->get(X86::PHI), DestReg)
23388 .addReg(Op1Reg).addMBB(copy0MBB)
23389 .addReg(Op2Reg).addMBB(thisMBB);
23391 // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // If we have a cascaded CMOV, the second Jcc provides the same incoming
23396 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
23397 if (CascadedCMOV) {
23398 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
23399 // Copy the PHI result to the register defined by the second CMOV.
23400 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
23401 DL, TII->get(TargetOpcode::COPY),
23402 CascadedCMOV->getOperand(0).getReg())
23403 .addReg(MI.getOperand(0).getReg());
    CascadedCMOV->eraseFromParent();
  }

  // Now remove the CMOV(s).
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
    (MIIt++)->eraseFromParent();

  return sinkMBB;
}
23414 MachineBasicBlock *
23415 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
23416 MachineBasicBlock *BB) const {
23417 // Combine the following atomic floating-point modification pattern:
23418 // a.store(reg OP a.load(acquire), release)
23419 // Transform them into:
23420 // OPss (%gpr), %xmm
23421 // movss %xmm, (%gpr)
23422 // Or sd equivalent for 64-bit operations.
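  // No explicit fence is required: on x86 an ordinary store already has
  // release semantics, so the plain MOVSS/MOVSD store implements the
  // release-ordered store of the original pattern.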
  unsigned MOp, FOp;
  switch (MI.getOpcode()) {
  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
  case X86::RELEASE_FADD32mr:
    FOp = X86::ADDSSrm;
    MOp = X86::MOVSSmr;
    break;
  case X86::RELEASE_FADD64mr:
    FOp = X86::ADDSDrm;
    MOp = X86::MOVSDmr;
    break;
  }
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
23436 DebugLoc DL = MI.getDebugLoc();
23437 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
23438 unsigned ValOpIdx = X86::AddrNumOperands;
23439 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
23440 MachineInstrBuilder MIB =
23441 BuildMI(*BB, MI, DL, TII->get(FOp),
              MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
          .addReg(VSrc);
23444 for (int i = 0; i < X86::AddrNumOperands; ++i) {
23445 MachineOperand &Operand = MI.getOperand(i);
23446 // Clear any kill flags on register operands as we'll create a second
23447 // instruction using the same address operands.
23448 if (Operand.isReg())
23449 Operand.setIsKill(false);
    MIB.addOperand(Operand);
  }
23452 MachineInstr *FOpMI = MIB;
23453 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
23454 for (int i = 0; i < X86::AddrNumOperands; ++i)
23455 MIB.addOperand(MI.getOperand(i));
23456 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
23461 MachineBasicBlock *
23462 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
23463 MachineBasicBlock *BB) const {
23464 MachineFunction *MF = BB->getParent();
23465 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23466 DebugLoc DL = MI.getDebugLoc();
23467 const BasicBlock *LLVM_BB = BB->getBasicBlock();
23469 assert(MF->shouldSplitStack());
23471 const bool Is64Bit = Subtarget.is64Bit();
23472 const bool IsLP64 = Subtarget.isTarget64BitLP64();
23474 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
23475 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
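  // These TLS slots hold the stack limit of the current stacklet; they must
  // match the offsets used by the segmented-stack prologue emitted in
  // X86FrameLowering (e.g. %fs:0x70 on x86-64 LP64 targets).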
  // BB:
  //  ... [Till the alloca]
  // If stacklet is not large enough, jump to mallocMBB
  //
  // bumpMBB:
  //  Allocate by subtracting from RSP
  //  Jump to continueMBB
  //
  // mallocMBB:
  //  Allocate by call to runtime
  //
  // continueMBB:
  //  ...
  //  [rest of original BB]
  //
23493 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23494 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23495 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23497 MachineRegisterInfo &MRI = MF->getRegInfo();
23498 const TargetRegisterClass *AddrRegClass =
23499 getRegClassFor(getPointerTy(MF->getDataLayout()));
23501 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
23502 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
23503 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
23504 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
           sizeVReg = MI.getOperand(1).getReg(),
           physSPReg =
               IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
23509 MachineFunction::iterator MBBIter = ++BB->getIterator();
23511 MF->insert(MBBIter, bumpMBB);
23512 MF->insert(MBBIter, mallocMBB);
23513 MF->insert(MBBIter, continueMBB);
23515 continueMBB->splice(continueMBB->begin(), BB,
23516 std::next(MachineBasicBlock::iterator(MI)), BB->end());
23517 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
23519 // Add code to the main basic block to check if the stack limit has been hit,
23520 // and if so, jump to mallocMBB otherwise to bumpMBB.
23521 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
23522 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
23523 .addReg(tmpSPVReg).addReg(sizeVReg);
23524 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
23525 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
23526 .addReg(SPLimitVReg);
23527 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
23529 // bumpMBB simply decreases the stack pointer, since we know the current
23530 // stacklet has enough space.
23531 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
23532 .addReg(SPLimitVReg);
23533 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
23534 .addReg(SPLimitVReg);
23535 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
23537 // Calls into a routine in libgcc to allocate more space from the heap.
23538 const uint32_t *RegMask =
      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
  if (IsLP64) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
      .addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
23544 .addExternalSymbol("__morestack_allocate_stack_space")
23545 .addRegMask(RegMask)
23546 .addReg(X86::RDI, RegState::Implicit)
23547 .addReg(X86::RAX, RegState::ImplicitDefine);
23548 } else if (Is64Bit) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
      .addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
23552 .addExternalSymbol("__morestack_allocate_stack_space")
23553 .addRegMask(RegMask)
23554 .addReg(X86::EDI, RegState::Implicit)
      .addReg(X86::EAX, RegState::ImplicitDefine);
  } else {
    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
      .addImm(16);
    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
23560 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
23561 .addExternalSymbol("__morestack_allocate_stack_space")
23562 .addRegMask(RegMask)
23563 .addReg(X86::EAX, RegState::ImplicitDefine);
    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
      .addImm(16);
  }

  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
23571 .addReg(IsLP64 ? X86::RAX : X86::EAX);
23572 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
23574 // Set up the CFG correctly.
23575 BB->addSuccessor(bumpMBB);
23576 BB->addSuccessor(mallocMBB);
23577 mallocMBB->addSuccessor(continueMBB);
23578 bumpMBB->addSuccessor(continueMBB);
23580 // Take care of the PHI nodes.
23581 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
23582 MI.getOperand(0).getReg())
      .addReg(mallocPtrVReg)
      .addMBB(mallocMBB)
      .addReg(bumpSPPtrVReg)
      .addMBB(bumpMBB);
23588 // Delete the original pseudo instruction.
23589 MI.eraseFromParent();
  return continueMBB;
}
23595 MachineBasicBlock *
23596 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
23597 MachineBasicBlock *BB) const {
23598 MachineFunction *MF = BB->getParent();
23599 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23600 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
23601 DebugLoc DL = MI.getDebugLoc();
23603 assert(!isAsynchronousEHPersonality(
23604 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
23605 "SEH does not use catchret!");
23607 // Only 32-bit EH needs to worry about manually restoring stack pointers.
  if (!Subtarget.is32Bit())
    return BB;
23611 // C++ EH creates a new target block to hold the restore code, and wires up
23612 // the new block to the return destination with a normal JMP_4.
23613 MachineBasicBlock *RestoreMBB =
23614 MF->CreateMachineBasicBlock(BB->getBasicBlock());
23615 assert(BB->succ_size() == 1);
23616 MF->insert(std::next(BB->getIterator()), RestoreMBB);
23617 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
23618 BB->addSuccessor(RestoreMBB);
23619 MI.getOperand(0).setMBB(RestoreMBB);
23621 auto RestoreMBBI = RestoreMBB->begin();
23622 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
  return BB;
}
23627 MachineBasicBlock *
23628 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
23629 MachineBasicBlock *BB) const {
23630 MachineFunction *MF = BB->getParent();
23631 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
23632 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
23633 // Only 32-bit SEH requires special handling for catchpad.
23634 if (IsSEH && Subtarget.is32Bit()) {
23635 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23636 DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
  }
  MI.eraseFromParent();
  return BB;
}
23643 MachineBasicBlock *
23644 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
23645 MachineBasicBlock *BB) const {
23646 // So, here we replace TLSADDR with the sequence:
23647 // adjust_stackdown -> TLSADDR -> adjust_stackup.
23648 // We need this because TLSADDR is lowered into calls
23649 // inside MC, therefore without the two markers shrink-wrapping
  // may push the prologue/epilogue past them.
23651 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23652 DebugLoc DL = MI.getDebugLoc();
23653 MachineFunction &MF = *BB->getParent();
23655 // Emit CALLSEQ_START right before the instruction.
23656 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
23657 MachineInstrBuilder CallseqStart =
23658 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
23659 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
23661 // Emit CALLSEQ_END right after the instruction.
23662 // We don't call erase from parent because we want to keep the
23663 // original instruction around.
23664 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
23665 MachineInstrBuilder CallseqEnd =
23666 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
  BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);

  return BB;
}
23672 MachineBasicBlock *
23673 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
23674 MachineBasicBlock *BB) const {
23675 // This is pretty easy. We're taking the value that we received from
23676 // our load from the relocation, sticking it in either RDI (x86-64)
23677 // or EAX and doing an indirect call. The return value will then
23678 // be in the normal return register.
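  // This is Darwin's TLV access sequence: the descriptor address is loaded
  // into RDI/EAX and the descriptor's first word (a getter function) is
  // called indirectly, which returns the variable's address in RAX/EAX.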
23679 MachineFunction *F = BB->getParent();
23680 const X86InstrInfo *TII = Subtarget.getInstrInfo();
23681 DebugLoc DL = MI.getDebugLoc();
23683 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
23684 assert(MI.getOperand(3).isGlobal() && "This should be a global");
23686 // Get a register mask for the lowered call.
23687 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
23688 // proper register mask.
23689 const uint32_t *RegMask =
23690 Subtarget.is64Bit() ?
23691 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
23692 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
23693 if (Subtarget.is64Bit()) {
23694 MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
            .addReg(X86::RIP)
            .addImm(1)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
23703 addDirectMem(MIB, X86::RDI);
23704 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
23705 } else if (!isPositionIndependent()) {
23706 MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(0)
            .addImm(1)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
23715 addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(TII->getGlobalBaseReg(F))
            .addImm(1)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
23726 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
23727 addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
23735 MachineBasicBlock *
23736 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
23737 MachineBasicBlock *MBB) const {
23738 DebugLoc DL = MI.getDebugLoc();
23739 MachineFunction *MF = MBB->getParent();
23740 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23741 MachineRegisterInfo &MRI = MF->getRegInfo();
23743 const BasicBlock *BB = MBB->getBasicBlock();
23744 MachineFunction::iterator I = ++MBB->getIterator();
23746 // Memory Reference
23747 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
23748 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
  unsigned DstReg;
  unsigned MemOpndSlot = 0;
23753 unsigned CurOp = 0;
23755 DstReg = MI.getOperand(CurOp++).getReg();
23756 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
23757 assert(RC->hasType(MVT::i32) && "Invalid destination!");
23758 unsigned mainDstReg = MRI.createVirtualRegister(RC);
23759 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
23761 MemOpndSlot = CurOp;
23763 MVT PVT = getPointerTy(MF->getDataLayout());
23764 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
23765 "Invalid Pointer Size!");
  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
  //  SjLjSetup restoreMBB
  //
  // mainMBB:
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //
  // restoreMBB:
  //  if base pointer being used, load it from frame
  //  v_restore = 1
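  //
  // The buffer layout assumed here matches emitEHSjLjLongJmp below: slot 0
  // holds the frame pointer, slot 1 (LabelOffset) the address of restoreMBB
  // stored in this function, and slot 2 (SPOffset) the stack pointer.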
23783 MachineBasicBlock *thisMBB = MBB;
23784 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
23785 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
23786 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
23787 MF->insert(I, mainMBB);
23788 MF->insert(I, sinkMBB);
23789 MF->push_back(restoreMBB);
23790 restoreMBB->setHasAddressTaken();
23792 MachineInstrBuilder MIB;
23794 // Transfer the remainder of BB and its successor edges to sinkMBB.
23795 sinkMBB->splice(sinkMBB->begin(), MBB,
23796 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23797 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
23800 unsigned PtrStoreOpc = 0;
23801 unsigned LabelReg = 0;
23802 const int64_t LabelOffset = 1 * PVT.getStoreSize();
23803 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
23804 !isPositionIndependent();
23806 // Prepare IP either in reg or imm.
23807 if (!UseImmLabel) {
23808 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
23809 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
23810 LabelReg = MRI.createVirtualRegister(PtrRC);
23811 if (Subtarget.is64Bit()) {
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
              .addReg(X86::RIP)
              .addImm(1)
              .addReg(0)
              .addMBB(restoreMBB)
              .addReg(0);
    } else {
      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
              .addReg(XII->getGlobalBaseReg(MF))
              .addImm(1)
              .addReg(0)
              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
              .addReg(0);
    }
  } else
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
23830 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
23831 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23832 if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
    else
      MIB.addOperand(MI.getOperand(MemOpndSlot + i));
  }
  if (!UseImmLabel)
    MIB.addReg(LabelReg);
  else
    MIB.addMBB(restoreMBB);
23841 MIB.setMemRefs(MMOBegin, MMOEnd);
23843 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
23844 .addMBB(restoreMBB);
23846 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23847 MIB.addRegMask(RegInfo->getNoPreservedMask());
23848 thisMBB->addSuccessor(mainMBB);
23849 thisMBB->addSuccessor(restoreMBB);
23853 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
23854 mainMBB->addSuccessor(sinkMBB);
23857 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
23858 TII->get(X86::PHI), DstReg)
23859 .addReg(mainDstReg).addMBB(mainMBB)
23860 .addReg(restoreDstReg).addMBB(restoreMBB);
23863 if (RegInfo->hasBasePointer(*MF)) {
23864 const bool Uses64BitFramePtr =
23865 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
23866 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
23867 X86FI->setRestoreBasePointer(MF);
23868 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
23869 unsigned BasePtr = RegInfo->getBaseRegister();
23870 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
23871 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
23872 FramePtr, true, X86FI->getRestoreBasePointerOffset())
        .setMIFlag(MachineInstr::FrameSetup);
  }
  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
23876 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
23877 restoreMBB->addSuccessor(sinkMBB);
  MI.eraseFromParent();
  return sinkMBB;
}
23883 MachineBasicBlock *
23884 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
23885 MachineBasicBlock *MBB) const {
23886 DebugLoc DL = MI.getDebugLoc();
23887 MachineFunction *MF = MBB->getParent();
23888 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23889 MachineRegisterInfo &MRI = MF->getRegInfo();
23891 // Memory Reference
23892 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
23893 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
23895 MVT PVT = getPointerTy(MF->getDataLayout());
23896 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
23897 "Invalid Pointer Size!");
23899 const TargetRegisterClass *RC =
23900 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
23901 unsigned Tmp = MRI.createVirtualRegister(RC);
23902 // Since FP is only updated here but NOT referenced, it's treated as GPR.
23903 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23904 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
23905 unsigned SP = RegInfo->getStackRegister();
23907 MachineInstrBuilder MIB;
23909 const int64_t LabelOffset = 1 * PVT.getStoreSize();
23910 const int64_t SPOffset = 2 * PVT.getStoreSize();
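  // Slot 0 of the buffer holds the saved frame pointer, so FP is reloaded
  // with no extra displacement, while the resume address and stack pointer
  // live at LabelOffset and SPOffset respectively.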
23912 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
23913 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
23916 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
23917 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
23918 MIB.addOperand(MI.getOperand(i));
23919 MIB.setMemRefs(MMOBegin, MMOEnd);
23921 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
23922 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23923 if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), LabelOffset);
    else
      MIB.addOperand(MI.getOperand(i));
  }
23928 MIB.setMemRefs(MMOBegin, MMOEnd);
23930 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
23931 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23932 if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), SPOffset);
    else
      MIB.addOperand(MI.getOperand(i));
  }
23937 MIB.setMemRefs(MMOBegin, MMOEnd);
23939 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
  MI.eraseFromParent();
  return MBB;
}
23945 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
23946 MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI) const {
  DebugLoc DL = MI.getDebugLoc();
23950 MachineFunction *MF = MBB->getParent();
23951 MachineRegisterInfo *MRI = &MF->getRegInfo();
23952 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23954 MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

  unsigned Op = 0;
  unsigned VR = 0;

  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();

  if (UseImmLabel) {
    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
  } else {
23966 const TargetRegisterClass *TRC =
23967 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
23968 VR = MRI->createVirtualRegister(TRC);
23969 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
23971 /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
23973 if (Subtarget.is64Bit())
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
          .addReg(X86::RIP)
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB)
          .addReg(0);
    else
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
          .addReg(0) /* XII->getGlobalBaseReg(MF) */
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
          .addReg(0);
  }
23989 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
  addFrameReference(MIB, FI, 36);
  if (UseImmLabel)
    MIB.addMBB(DispatchBB);
  else
    MIB.addReg(VR);
}
23997 MachineBasicBlock *
23998 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
23999 MachineBasicBlock *BB) const {
24000 DebugLoc DL = MI.getDebugLoc();
24001 MachineFunction *MF = BB->getParent();
24002 MachineModuleInfo *MMI = &MF->getMMI();
24003 MachineFrameInfo *MFI = MF->getFrameInfo();
24004 MachineRegisterInfo *MRI = &MF->getRegInfo();
24005 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24006 int FI = MFI->getFunctionContextIndex();
24008 // Get a mapping of the call site numbers to all of the landing pads they're
24009 // associated with.
24010 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
24011 unsigned MaxCSNum = 0;
24012 for (auto &MBB : *MF) {
    if (!MBB.isEHPad())
      continue;

    MCSymbol *Sym = nullptr;
    for (const auto &MI : MBB) {
      if (MI.isDebugValue())
        continue;

      assert(MI.isEHLabel() && "expected EH_LABEL");
      Sym = MI.getOperand(0).getMCSymbol();
      break;
    }

    if (!MMI->hasCallSiteLandingPad(Sym))
      continue;

    for (unsigned CSI : MMI->getCallSiteLandingPad(Sym)) {
      CallSiteNumToLPad[CSI].push_back(&MBB);
      MaxCSNum = std::max(MaxCSNum, CSI);
    }
  }
24035 // Get an ordered list of the machine basic blocks for the jump table.
24036 std::vector<MachineBasicBlock *> LPadList;
24037 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
24038 LPadList.reserve(CallSiteNumToLPad.size());
24040 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
24041 for (auto &LP : CallSiteNumToLPad[CSI]) {
24042 LPadList.push_back(LP);
      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
    }
  }

  assert(!LPadList.empty() &&
24048 "No landing pad destinations for the dispatch jump table!");
24050 // Create the MBBs for the dispatch code.
24052 // Shove the dispatch's address into the return slot in the function context.
24053 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
24054 DispatchBB->setIsEHPad(true);
24056 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
24057 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
24058 DispatchBB->addSuccessor(TrapBB);
24060 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
24061 DispatchBB->addSuccessor(DispContBB);
24064 MF->push_back(DispatchBB);
24065 MF->push_back(DispContBB);
24066 MF->push_back(TrapBB);
  // Insert code into the entry block that creates and registers the function
  // context.
24070 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
24072 // Create the jump table and associated information
24073 MachineJumpTableInfo *JTI =
24074 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
24075 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
24077 const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
24078 const X86RegisterInfo &RI = XII->getRegisterInfo();
24080 // Add a register mask with no preserved registers. This results in all
24081 // registers being marked as clobbered.
24082 if (RI.hasBasePointer(*MF)) {
24083 const bool FPIs64Bit =
24084 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
24085 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
24086 MFI->setRestoreBasePointer(MF);
24088 unsigned FP = RI.getFrameRegister(*MF);
24089 unsigned BP = RI.getBaseRegister();
24090 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
24091 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
24092 MFI->getRestoreBasePointerOffset())
24093 .addRegMask(RI.getNoPreservedMask());
24095 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
24096 .addRegMask(RI.getNoPreservedMask());
24099 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
                    4);
  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
      .addReg(IReg)
      .addImm(LPadList.size());
24105 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
24107 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
      .addReg(IReg)
      .addImm(1);
  BuildMI(DispContBB, DL,
          TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
      .addReg(0)
      .addImm(Subtarget.is64Bit() ? 8 : 4)
      .addReg(JReg)
      .addJumpTableIndex(MJTI)
      .addImm(0);
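  // The call-site value recorded in the function context is 1-based, so it is
  // decremented before being used to index the jump table; the scale operand
  // selects pointer-sized jump-table entries.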
24119 // Add the jump table entries as successors to the MBB.
24120 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
24121 for (auto &LP : LPadList)
24122 if (SeenMBBs.insert(LP).second)
24123 DispContBB->addSuccessor(LP);
24125 // N.B. the order the invoke BBs are processed in doesn't matter here.
24126 SmallVector<MachineBasicBlock *, 64> MBBLPads;
24127 const MCPhysReg *SavedRegs =
24128 Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
24129 for (MachineBasicBlock *MBB : InvokeBBs) {
24130 // Remove the landing pad successor from the invoke block and replace it
24131 // with the new dispatch block.
24132 // Keep a copy of Successors since it's modified inside the loop.
24133 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
24135 // FIXME: Avoid quadratic complexity.
24136 for (auto MBBS : Successors) {
24137 if (MBBS->isEHPad()) {
24138 MBB->removeSuccessor(MBBS);
        MBBLPads.push_back(MBBS);
      }
    }

    MBB->addSuccessor(DispatchBB);
24145 // Find the invoke call and mark all of the callee-saved registers as
24146 // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (auto &II : reverse(*MBB)) {
      if (!II.isCall())
        continue;

      DenseMap<unsigned, bool> DefRegs;
      for (auto &MOp : II.operands())
        if (MOp.isReg())
          DefRegs[MOp.getReg()] = true;

      MachineInstrBuilder MIB(*MF, &II);
      for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
        unsigned Reg = SavedRegs[RI];
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }
24169 // Mark all former landing pads as non-landing pads. The dispatch is the only
24170 // landing pad now.
24171 for (auto &LP : MBBLPads)
24172 LP->setIsEHPad(false);
24174 // The instruction is gone now.
  MI.eraseFromParent();
  return BB;
}
24179 // Replace 213-type (isel default) FMA3 instructions with 231-type for
24180 // accumulator loops. Writing back to the accumulator allows the coalescer
24181 // to remove extra copies in the loop.
24182 // FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937).
24183 MachineBasicBlock *
24184 X86TargetLowering::emitFMA3Instr(MachineInstr &MI,
24185 MachineBasicBlock *MBB) const {
24186 MachineOperand &AddendOp = MI.getOperand(3);
24188 // Bail out early if the addend isn't a register - we can't switch these.
  if (!AddendOp.isReg())
    return MBB;
24192 MachineFunction &MF = *MBB->getParent();
24193 MachineRegisterInfo &MRI = MF.getRegInfo();
24195 // Check whether the addend is defined by a PHI:
24196 assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
24197 MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
  if (!AddendDef.isPHI())
    return MBB;
24201 // Look for the following pattern:
24203 // %addend = phi [%entry, 0], [%loop, %result]
  // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
  //
  // Replace with:
  //
  // %addend = phi [%entry, 0], [%loop, %result]
24211 // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
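  //
  // Both forms compute the same product-plus-addend value; they differ only
  // in which source is tied to the destination. Tying the accumulator (the
  // PHI result) to the destination lets the register coalescer keep the
  // accumulator in one register across loop iterations.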
24213 for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
24214 assert(AddendDef.getOperand(i).isReg());
24215 MachineOperand PHISrcOp = AddendDef.getOperand(i);
24216 MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
24217 if (&PHISrcInst == &MI) {
24218 // Found a matching instruction.
24219 unsigned NewFMAOpc = 0;
      switch (MI.getOpcode()) {
      case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
      case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
      case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
      case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
      case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
      case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
      case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
      case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
      case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
      case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
      case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
      case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
      case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
      case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
      case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
      case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
      case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
      case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
      case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
      case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;

      case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
      case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
      case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
      case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
      case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
      case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
      case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
      case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
      case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
      case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
      case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
      case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
      default: llvm_unreachable("Unrecognized FMA variant.");
      }
24322 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
24323 MachineInstrBuilder MIB =
24324 BuildMI(MF, MI.getDebugLoc(), TII.get(NewFMAOpc))
24325 .addOperand(MI.getOperand(0))
24326 .addOperand(MI.getOperand(3))
24327 .addOperand(MI.getOperand(2))
24328 .addOperand(MI.getOperand(1));
24329 MBB->insert(MachineBasicBlock::iterator(MI), MIB);
      MI.eraseFromParent();
    }
  }

  return MBB;
}
24337 MachineBasicBlock *
24338 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
24339 MachineBasicBlock *BB) const {
24340 switch (MI.getOpcode()) {
24341 default: llvm_unreachable("Unexpected instr type to insert");
24342 case X86::TAILJMPd64:
24343 case X86::TAILJMPr64:
24344 case X86::TAILJMPm64:
24345 case X86::TAILJMPd64_REX:
24346 case X86::TAILJMPr64_REX:
24347 case X86::TAILJMPm64_REX:
24348 llvm_unreachable("TAILJMP64 would not be touched here.");
24349 case X86::TCRETURNdi64:
24350 case X86::TCRETURNri64:
  case X86::TCRETURNmi64:
    return BB;
24353 case X86::TLS_addr32:
24354 case X86::TLS_addr64:
24355 case X86::TLS_base_addr32:
24356 case X86::TLS_base_addr64:
24357 return EmitLoweredTLSAddr(MI, BB);
24358 case X86::CATCHRET:
24359 return EmitLoweredCatchRet(MI, BB);
24360 case X86::CATCHPAD:
24361 return EmitLoweredCatchPad(MI, BB);
24362 case X86::SEG_ALLOCA_32:
24363 case X86::SEG_ALLOCA_64:
24364 return EmitLoweredSegAlloca(MI, BB);
24365 case X86::TLSCall_32:
24366 case X86::TLSCall_64:
24367 return EmitLoweredTLSCall(MI, BB);
24368 case X86::CMOV_FR32:
24369 case X86::CMOV_FR64:
24370 case X86::CMOV_FR128:
24371 case X86::CMOV_GR8:
24372 case X86::CMOV_GR16:
24373 case X86::CMOV_GR32:
24374 case X86::CMOV_RFP32:
24375 case X86::CMOV_RFP64:
24376 case X86::CMOV_RFP80:
24377 case X86::CMOV_V2F64:
24378 case X86::CMOV_V2I64:
24379 case X86::CMOV_V4F32:
24380 case X86::CMOV_V4F64:
24381 case X86::CMOV_V4I64:
24382 case X86::CMOV_V16F32:
24383 case X86::CMOV_V8F32:
24384 case X86::CMOV_V8F64:
24385 case X86::CMOV_V8I64:
24386 case X86::CMOV_V8I1:
24387 case X86::CMOV_V16I1:
24388 case X86::CMOV_V32I1:
24389 case X86::CMOV_V64I1:
24390 return EmitLoweredSelect(MI, BB);
24392 case X86::RDFLAGS32:
24393 case X86::RDFLAGS64: {
24394 DebugLoc DL = MI.getDebugLoc();
    const TargetInstrInfo *TII = Subtarget.getInstrInfo();
    unsigned PushF =
        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
24398 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
24399 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
24400 // Permit reads of the FLAGS register without it being defined.
24401 // This intrinsic exists to read external processor state in flags, such as
24402 // the trap flag, interrupt flag, and direction flag, none of which are
24403 // modeled by the backend.
24404 Push->getOperand(2).setIsUndef();
24405 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }
24411 case X86::WRFLAGS32:
24412 case X86::WRFLAGS64: {
24413 DebugLoc DL = MI.getDebugLoc();
    const TargetInstrInfo *TII = Subtarget.getInstrInfo();
    unsigned Push =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
    unsigned PopF =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
24419 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
24420 BuildMI(*BB, MI, DL, TII->get(PopF));
    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }
24426 case X86::RELEASE_FADD32mr:
24427 case X86::RELEASE_FADD64mr:
24428 return EmitLoweredAtomicFP(MI, BB);
24430 case X86::FP32_TO_INT16_IN_MEM:
24431 case X86::FP32_TO_INT32_IN_MEM:
24432 case X86::FP32_TO_INT64_IN_MEM:
24433 case X86::FP64_TO_INT16_IN_MEM:
24434 case X86::FP64_TO_INT32_IN_MEM:
24435 case X86::FP64_TO_INT64_IN_MEM:
24436 case X86::FP80_TO_INT16_IN_MEM:
24437 case X86::FP80_TO_INT32_IN_MEM:
24438 case X86::FP80_TO_INT64_IN_MEM: {
24439 MachineFunction *F = BB->getParent();
24440 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24441 DebugLoc DL = MI.getDebugLoc();
24443 // Change the floating point control register to use "round towards zero"
24444 // mode when truncating to an integer value.
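    // The sequence is: FNSTCW saves the current control word to a stack slot,
    // a 16-bit store rewrites it with the rounding-control bits set to
    // "round toward zero" (0xC7F), FLDCW activates it, the IST_Fp* store does
    // the truncating conversion, and the saved control word is then restored.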
24445 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
24446 addFrameReference(BuildMI(*BB, MI, DL,
24447 TII->get(X86::FNSTCW16m)), CWFrameIdx);
    // Load the old value of the high byte of the control word...
    unsigned OldCW =
        F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
24452 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
24455 // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
      .addImm(0xC7F);
24459 // Reload the modified control word now...
24460 addFrameReference(BuildMI(*BB, MI, DL,
24461 TII->get(X86::FLDCW16m)), CWFrameIdx);
24463 // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
      .addReg(OldCW);
    // Get the X86 opcode to use.
    unsigned Opc;
24469 switch (MI.getOpcode()) {
24470 default: llvm_unreachable("illegal opcode!");
24471 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
24472 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
24473 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
24474 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
24475 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
24476 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
24477 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
24478 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    X86AddressMode AM = getAddressFromInstr(&MI, 0);
24483 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
24484 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
24486 // Reload the original control word now.
24487 addFrameReference(BuildMI(*BB, MI, DL,
24488 TII->get(X86::FLDCW16m)), CWFrameIdx);
24490 MI.eraseFromParent(); // The pseudo instruction is gone now.
24491 return BB;
24492 }
24493 // String/text processing lowering.
24494 case X86::PCMPISTRM128REG:
24495 case X86::VPCMPISTRM128REG:
24496 case X86::PCMPISTRM128MEM:
24497 case X86::VPCMPISTRM128MEM:
24498 case X86::PCMPESTRM128REG:
24499 case X86::VPCMPESTRM128REG:
24500 case X86::PCMPESTRM128MEM:
24501 case X86::VPCMPESTRM128MEM:
24502 assert(Subtarget.hasSSE42() &&
24503 "Target must have SSE4.2 or AVX features enabled");
24504 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
24506 // String/text processing lowering.
24507 case X86::PCMPISTRIREG:
24508 case X86::VPCMPISTRIREG:
24509 case X86::PCMPISTRIMEM:
24510 case X86::VPCMPISTRIMEM:
24511 case X86::PCMPESTRIREG:
24512 case X86::VPCMPESTRIREG:
24513 case X86::PCMPESTRIMEM:
24514 case X86::VPCMPESTRIMEM:
24515 assert(Subtarget.hasSSE42() &&
24516 "Target must have SSE4.2 or AVX features enabled");
24517 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
24519 // Thread synchronization.
24520 case X86::MONITOR:
24521 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
24522 case X86::MONITORX:
24523 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
24525 case X86::WRPKRU:
24526 return emitWRPKRU(MI, BB, Subtarget);
24527 case X86::RDPKRU:
24528 return emitRDPKRU(MI, BB, Subtarget);
24530 case X86::XBEGIN:
24531 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
24533 case X86::VASTART_SAVE_XMM_REGS:
24534 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
24536 case X86::VAARG_64:
24537 return EmitVAARG64WithCustomInserter(MI, BB);
24539 case X86::EH_SjLj_SetJmp32:
24540 case X86::EH_SjLj_SetJmp64:
24541 return emitEHSjLjSetJmp(MI, BB);
24543 case X86::EH_SjLj_LongJmp32:
24544 case X86::EH_SjLj_LongJmp64:
24545 return emitEHSjLjLongJmp(MI, BB);
24547 case X86::Int_eh_sjlj_setup_dispatch:
24548 return EmitSjLjDispatchBlock(MI, BB);
24550 case TargetOpcode::STATEPOINT:
24551 // As an implementation detail, STATEPOINT shares the STACKMAP format at
24552 // this point in the process. We diverge later.
24553 return emitPatchPoint(MI, BB);
24555 case TargetOpcode::STACKMAP:
24556 case TargetOpcode::PATCHPOINT:
24557 return emitPatchPoint(MI, BB);
24559 case X86::VFMADDPDr213r:
24560 case X86::VFMADDPSr213r:
24561 case X86::VFMADDSDr213r:
24562 case X86::VFMADDSSr213r:
24563 case X86::VFMSUBPDr213r:
24564 case X86::VFMSUBPSr213r:
24565 case X86::VFMSUBSDr213r:
24566 case X86::VFMSUBSSr213r:
24567 case X86::VFNMADDPDr213r:
24568 case X86::VFNMADDPSr213r:
24569 case X86::VFNMADDSDr213r:
24570 case X86::VFNMADDSSr213r:
24571 case X86::VFNMSUBPDr213r:
24572 case X86::VFNMSUBPSr213r:
24573 case X86::VFNMSUBSDr213r:
24574 case X86::VFNMSUBSSr213r:
24575 case X86::VFMADDSUBPDr213r:
24576 case X86::VFMADDSUBPSr213r:
24577 case X86::VFMSUBADDPDr213r:
24578 case X86::VFMSUBADDPSr213r:
24579 case X86::VFMADDPDr213rY:
24580 case X86::VFMADDPSr213rY:
24581 case X86::VFMSUBPDr213rY:
24582 case X86::VFMSUBPSr213rY:
24583 case X86::VFNMADDPDr213rY:
24584 case X86::VFNMADDPSr213rY:
24585 case X86::VFNMSUBPDr213rY:
24586 case X86::VFNMSUBPSr213rY:
24587 case X86::VFMADDSUBPDr213rY:
24588 case X86::VFMADDSUBPSr213rY:
24589 case X86::VFMSUBADDPDr213rY:
24590 case X86::VFMSUBADDPSr213rY:
24591 return emitFMA3Instr(MI, BB);
24592 case X86::LCMPXCHG8B_SAVE_EBX:
24593 case X86::LCMPXCHG16B_SAVE_RBX: {
24594 unsigned BasePtr =
24595 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
24596 if (!BB->isLiveIn(BasePtr))
24597 BB->addLiveIn(BasePtr);
24603 //===----------------------------------------------------------------------===//
24604 // X86 Optimization Hooks
24605 //===----------------------------------------------------------------------===//
24607 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
24608 APInt &KnownZero,
24609 APInt &KnownOne,
24610 const SelectionDAG &DAG,
24611 unsigned Depth) const {
24612 unsigned BitWidth = KnownZero.getBitWidth();
24613 unsigned Opc = Op.getOpcode();
24614 assert((Opc >= ISD::BUILTIN_OP_END ||
24615 Opc == ISD::INTRINSIC_WO_CHAIN ||
24616 Opc == ISD::INTRINSIC_W_CHAIN ||
24617 Opc == ISD::INTRINSIC_VOID) &&
24618 "Should use MaskedValueIsZero if you don't know whether Op"
24619 " is a target node!");
24621 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
24635 // These nodes' second result is a boolean.
24636 if (Op.getResNo() == 0)
24639 case X86ISD::SETCC:
24640 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
24642 case X86ISD::MOVMSK: {
24643 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
24644 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
24650 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
24651 SDValue Op,
24652 const SelectionDAG &,
24653 unsigned Depth) const {
24654 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
24655 if (Op.getOpcode() == X86ISD::SETCC_CARRY)
24656 return Op.getValueType().getScalarSizeInBits();
24662 /// Returns true (and the GlobalValue and the offset) if the node is a
24663 /// GlobalAddress + offset.
24664 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
24665 const GlobalValue* &GA,
24666 int64_t &Offset) const {
24667 if (N->getOpcode() == X86ISD::Wrapper) {
24668 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
24669 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
24670 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
24674 return TargetLowering::isGAPlusOffset(N, GA, Offset);
24677 /// Performs shuffle combines for 256-bit vectors.
24678 /// FIXME: This could be expanded to support 512 bit vectors as well.
24679 static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
24680 TargetLowering::DAGCombinerInfo &DCI,
24681 const X86Subtarget &Subtarget) {
24683 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
24684 SDValue V1 = SVOp->getOperand(0);
24685 SDValue V2 = SVOp->getOperand(1);
24686 MVT VT = SVOp->getSimpleValueType(0);
24687 unsigned NumElems = VT.getVectorNumElements();
24689 if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
24690 V2.getOpcode() == ISD::CONCAT_VECTORS) {
24694 // V UNDEF BUILD_VECTOR UNDEF
24696 // CONCAT_VECTOR CONCAT_VECTOR
24699 // RESULT: V + zero extended
24701 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
24702 !V2.getOperand(1).isUndef() || !V1.getOperand(1).isUndef())
24705 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
24708 // To match the shuffle mask, the first half of the mask should
24709 // be exactly the first vector, and all the rest a splat with the
24710 // first element of the second one.
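// E.g. with NumElems == 8 this accepts a mask such as <0,1,2,3,8,8,8,8>: the
// low half takes the first CONCAT operand V unchanged and the high half
// splats element 0 of the all-zeros BUILD_VECTOR, i.e. V zero-extended into
// the wider vector.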
24711 for (unsigned i = 0; i != NumElems/2; ++i)
24712 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
24713 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
24716 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
24717 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
24718 if (Ld->hasNUsesOfValue(1, 0)) {
24719 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
24720 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
24722 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
24724 Ld->getPointerInfo(),
24725 Ld->getAlignment(),
24726 false/*isVolatile*/, true/*ReadMem*/,
24727 false/*WriteMem*/);
24729 // Make sure the newly-created LOAD is in the same position as Ld in
24730 // terms of dependency. We create a TokenFactor for Ld and ResNode,
24731 // and update uses of Ld's output chain to use the TokenFactor.
24732 if (Ld->hasAnyUseOfValue(1)) {
24733 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24734 SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
24735 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
24736 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
24737 SDValue(ResNode.getNode(), 1));
24740 return DAG.getBitcast(VT, ResNode);
24744 // Emit a zeroed vector and insert the desired subvector on its
24746 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
24747 SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
24748 return DCI.CombineTo(N, InsV);
24754 // Attempt to match a combined shuffle mask against supported unary shuffle
24755 // instructions.
24756 // TODO: Investigate sharing more of this with shuffle lowering.
24757 static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24758 const X86Subtarget &Subtarget,
24759 unsigned &Shuffle, MVT &ShuffleVT) {
24760 bool FloatDomain = SrcVT.isFloatingPoint() ||
24761 (!Subtarget.hasAVX2() && SrcVT.is256BitVector());
24763 // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
24764 if (!FloatDomain && SrcVT.is128BitVector() &&
24765 isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) {
24766 Shuffle = X86ISD::VZEXT_MOVL;
24767 ShuffleVT = MVT::v2i64;
24771 // Check if we have SSE3 which will let us use MOVDDUP etc. The
24772 // instructions are no slower than UNPCKLPD but have the option to
24773 // fold the input operand from memory, even an unaligned load.
24774 if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
24775 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
24776 Shuffle = X86ISD::MOVDDUP;
24777 ShuffleVT = MVT::v2f64;
24780 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
24781 Shuffle = X86ISD::MOVSLDUP;
24782 ShuffleVT = MVT::v4f32;
24785 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
24786 Shuffle = X86ISD::MOVSHDUP;
24787 ShuffleVT = MVT::v4f32;
24792 if (SrcVT.is256BitVector() && FloatDomain) {
24793 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
24794 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
24795 Shuffle = X86ISD::MOVDDUP;
24796 ShuffleVT = MVT::v4f64;
24799 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
24800 Shuffle = X86ISD::MOVSLDUP;
24801 ShuffleVT = MVT::v8f32;
24804 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
24805 Shuffle = X86ISD::MOVSHDUP;
24806 ShuffleVT = MVT::v8f32;
24811 if (SrcVT.is512BitVector() && FloatDomain) {
24812 assert(Subtarget.hasAVX512() &&
24813 "AVX512 required for 512-bit vector shuffles");
24814 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
24815 Shuffle = X86ISD::MOVDDUP;
24816 ShuffleVT = MVT::v8f64;
24819 if (isTargetShuffleEquivalent(
24820 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
24821 Shuffle = X86ISD::MOVSLDUP;
24822 ShuffleVT = MVT::v16f32;
24825 if (isTargetShuffleEquivalent(
24826 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
24827 Shuffle = X86ISD::MOVSHDUP;
24828 ShuffleVT = MVT::v16f32;
24833 // Attempt to match against broadcast-from-vector.
24834 if (Subtarget.hasAVX2()) {
24835 unsigned NumElts = Mask.size();
24836 SmallVector<int, 64> BroadcastMask(NumElts, 0);
24837 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
24838 unsigned EltSize = SrcVT.getSizeInBits() / NumElts;
24839 ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(EltSize)
24840 : MVT::getIntegerVT(EltSize);
24841 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumElts);
24842 Shuffle = X86ISD::VBROADCAST;
24850 // Attempt to match a combined shuffle mask against supported unary immediate
24851 // permute instructions.
24852 // TODO: Investigate sharing more of this with shuffle lowering.
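// (Rough sketch of the immediate encoding used below: for a fully-defined
// 4-element mask <m0,m1,m2,m3>, getV4X86ShuffleImm produces the 8-bit value
// m0 | (m1 << 2) | (m2 << 4) | (m3 << 6), the form that PSHUFD, PSHUFLW,
// PSHUFHW and VPERMILPI immediates take.)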
24853 static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24854 const X86Subtarget &Subtarget,
24855 unsigned &Shuffle, MVT &ShuffleVT,
24856 unsigned &PermuteImm) {
24857 // Ensure we don't contain any zero elements.
24858 for (int M : Mask) {
24859 if (M == SM_SentinelZero)
24861 assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
24862 "Expected unary shuffle");
24865 unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
24866 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
24868 // Handle PSHUFLW/PSHUFHW repeated patterns.
24869 if (MaskScalarSizeInBits == 16) {
24870 SmallVector<int, 4> RepeatedMask;
24871 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
24872 ArrayRef<int> LoMask(Mask.data() + 0, 4);
24873 ArrayRef<int> HiMask(Mask.data() + 4, 4);
24875 // PSHUFLW: permute lower 4 elements only.
24876 if (isUndefOrInRange(LoMask, 0, 4) &&
24877 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
24878 Shuffle = X86ISD::PSHUFLW;
24879 ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
24880 PermuteImm = getV4X86ShuffleImm(LoMask);
24884 // PSHUFHW: permute upper 4 elements only.
24885 if (isUndefOrInRange(HiMask, 4, 8) &&
24886 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
24887 // Offset the HiMask so that we can create the shuffle immediate.
24888 int OffsetHiMask[4];
24889 for (int i = 0; i != 4; ++i)
24890 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
24892 Shuffle = X86ISD::PSHUFHW;
24893 ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
24894 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
24903 // We only support permutation of 32/64 bit elements after this.
24904 if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
24907 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
24908 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
24909 bool FloatDomain = SrcVT.isFloatingPoint();
24910 if (FloatDomain && !Subtarget.hasAVX())
24913 // Pre-AVX2 we must use float shuffles on 256-bit vectors.
24914 if (SrcVT.is256BitVector() && !Subtarget.hasAVX2())
24915 FloatDomain = true;
24917 // Check for lane crossing permutes.
24918 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
24919 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
24920 if (Subtarget.hasAVX2() && SrcVT.is256BitVector() && Mask.size() == 4) {
24921 Shuffle = X86ISD::VPERMI;
24922 ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
24923 PermuteImm = getV4X86ShuffleImm(Mask);
24926 if (Subtarget.hasAVX512() && SrcVT.is512BitVector() && Mask.size() == 8) {
24927 SmallVector<int, 4> RepeatedMask;
24928 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
24929 Shuffle = X86ISD::VPERMI;
24930 ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
24931 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
24938 // VPERMILPD can permute with a non-repeating shuffle.
24939 if (FloatDomain && MaskScalarSizeInBits == 64) {
24940 Shuffle = X86ISD::VPERMILPI;
24941 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
24943 for (int i = 0, e = Mask.size(); i != e; ++i) {
24945 if (M == SM_SentinelUndef)
24947 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
24948 PermuteImm |= (M & 1) << i;
24953 // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
24954 SmallVector<int, 4> RepeatedMask;
24955 if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
24958 // Narrow the repeated mask for 32-bit element permutes.
24959 SmallVector<int, 4> WordMask = RepeatedMask;
24960 if (MaskScalarSizeInBits == 64)
24961 scaleShuffleMask(2, RepeatedMask, WordMask);
24963 Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
24964 ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
24965 ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32);
24966 PermuteImm = getV4X86ShuffleImm(WordMask);
24970 // Attempt to match a combined unary shuffle mask against supported binary
24971 // shuffle instructions.
24972 // TODO: Investigate sharing more of this with shuffle lowering.
24973 static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24974 unsigned &Shuffle, MVT &ShuffleVT) {
24975 bool FloatDomain = SrcVT.isFloatingPoint();
24977 if (SrcVT.is128BitVector()) {
24978 if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
24979 Shuffle = X86ISD::MOVLHPS;
24980 ShuffleVT = MVT::v4f32;
24983 if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
24984 Shuffle = X86ISD::MOVHLPS;
24985 ShuffleVT = MVT::v4f32;
24988 if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
24989 Shuffle = X86ISD::UNPCKL;
24990 ShuffleVT = MVT::v4f32;
24993 if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
24994 Shuffle = X86ISD::UNPCKH;
24995 ShuffleVT = MVT::v4f32;
24998 if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
24999 isTargetShuffleEquivalent(
25000 Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
25001 Shuffle = X86ISD::UNPCKL;
25002 ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
25005 if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
25006 isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
25007 13, 14, 14, 15, 15})) {
25008 Shuffle = X86ISD::UNPCKH;
25009 ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
25017 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
25018 /// possible.
25020 /// This is the leaf of the recursive combine below. When we have found some
25021 /// chain of single-use x86 shuffle instructions and accumulated the combined
25022 /// shuffle mask represented by them, this will try to pattern match that mask
25023 /// into either a single instruction if there is a special purpose instruction
25024 /// for this operation, or into a PSHUFB instruction which is a fully general
25025 /// instruction but should only be used to replace chains over a certain depth.
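/// For example, a chain whose accumulated mask is <1,1,3,3> on a v4f32 root
/// can be matched directly to a single MOVSHDUP, while a deeper chain with an
/// arbitrary byte-level mask falls back to the PSHUFB path.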
25026 static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
25027 ArrayRef<int> BaseMask, int Depth,
25028 bool HasVariableMask, SelectionDAG &DAG,
25029 TargetLowering::DAGCombinerInfo &DCI,
25030 const X86Subtarget &Subtarget) {
25031 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
25033 // Find the operand that enters the chain. Note that multiple uses are OK
25034 // here, we're not going to remove the operand we find.
25035 Input = peekThroughBitcasts(Input);
25037 MVT VT = Input.getSimpleValueType();
25038 MVT RootVT = Root.getSimpleValueType();
25043 unsigned NumBaseMaskElts = BaseMask.size();
25044 if (NumBaseMaskElts == 1) {
25045 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
25046 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
25051 unsigned RootSizeInBits = RootVT.getSizeInBits();
25052 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
25054 // Don't combine if we are an AVX512/EVEX target and the mask element size
25055 // is different from the root element size - this would prevent writemasks
25056 // from being reused.
25057 // TODO - this currently prevents all lane shuffles from occurring.
25058 // TODO - check for writemasks usage instead of always preventing combining.
25059 // TODO - attempt to narrow Mask back to writemask size.
25060 if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits &&
25061 (RootSizeInBits == 512 ||
25062 (Subtarget.hasVLX() && RootSizeInBits >= 128))) {
25066 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
25068 // Handle 128-bit lane shuffles of 256-bit vectors.
25069 if (VT.is256BitVector() && NumBaseMaskElts == 2 &&
25070 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
25071 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
25072 return false; // Nothing to do!
25073 MVT ShuffleVT = (VT.isFloatingPoint() || !Subtarget.hasAVX2() ? MVT::v4f64
25074 : MVT::v4i64);
25075 unsigned PermMask = 0;
25076 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
25077 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
25079 Res = DAG.getBitcast(ShuffleVT, Input);
25080 DCI.AddToWorklist(Res.getNode());
25081 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
25082 DAG.getUNDEF(ShuffleVT),
25083 DAG.getConstant(PermMask, DL, MVT::i8));
25084 DCI.AddToWorklist(Res.getNode());
25085 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25090 // For masks that have been widened to 128-bit elements or more,
25091 // narrow back down to 64-bit elements.
25092 SmallVector<int, 64> Mask;
25093 if (BaseMaskEltSizeInBits > 64) {
25094 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
25095 int MaskScale = BaseMaskEltSizeInBits / 64;
25096 scaleShuffleMask(MaskScale, BaseMask, Mask);
25098 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
25101 unsigned NumMaskElts = Mask.size();
25102 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
25104 // Determine the effective mask value type.
25105 bool FloatDomain =
25106 (VT.isFloatingPoint() || (VT.is256BitVector() && !Subtarget.hasAVX2())) &&
25107 (32 <= MaskEltSizeInBits);
25108 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
25109 : MVT::getIntegerVT(MaskEltSizeInBits);
25110 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
25112 // Attempt to match the mask against known shuffle patterns.
25113 MVT ShuffleVT;
25114 unsigned Shuffle, PermuteImm;
25116 if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) {
25117 if (Depth == 1 && Root.getOpcode() == Shuffle)
25118 return false; // Nothing to do!
25119 Res = DAG.getBitcast(ShuffleVT, Input);
25120 DCI.AddToWorklist(Res.getNode());
25121 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
25122 DCI.AddToWorklist(Res.getNode());
25123 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25128 if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT,
25130 if (Depth == 1 && Root.getOpcode() == Shuffle)
25131 return false; // Nothing to do!
25132 Res = DAG.getBitcast(ShuffleVT, Input);
25133 DCI.AddToWorklist(Res.getNode());
25134 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
25135 DAG.getConstant(PermuteImm, DL, MVT::i8));
25136 DCI.AddToWorklist(Res.getNode());
25137 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25142 if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) {
25143 if (Depth == 1 && Root.getOpcode() == Shuffle)
25144 return false; // Nothing to do!
25145 Res = DAG.getBitcast(ShuffleVT, Input);
25146 DCI.AddToWorklist(Res.getNode());
25147 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);
25148 DCI.AddToWorklist(Res.getNode());
25149 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25154 // Attempt to blend with zero.
25155 if (NumMaskElts <= 8 &&
25156 ((Subtarget.hasSSE41() && VT.is128BitVector()) ||
25157 (Subtarget.hasAVX() && VT.is256BitVector()))) {
25158 // Convert VT to a type compatible with X86ISD::BLENDI.
25159 // TODO - add 16i16 support (requires lane duplication).
25160 MVT ShuffleVT = MaskVT;
25161 if (Subtarget.hasAVX2()) {
25162 if (ShuffleVT == MVT::v4i64)
25163 ShuffleVT = MVT::v8i32;
25164 else if (ShuffleVT == MVT::v2i64)
25165 ShuffleVT = MVT::v4i32;
25167 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
25168 ShuffleVT = MVT::v8i16;
25169 else if (ShuffleVT == MVT::v4i64)
25170 ShuffleVT = MVT::v4f64;
25171 else if (ShuffleVT == MVT::v8i32)
25172 ShuffleVT = MVT::v8f32;
25175 if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
25176 /*Low*/ 0) &&
25177 NumMaskElts <= ShuffleVT.getVectorNumElements()) {
25178 unsigned BlendMask = 0;
25179 unsigned ShuffleSize = ShuffleVT.getVectorNumElements();
25180 unsigned MaskRatio = ShuffleSize / NumMaskElts;
25182 if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI)
25185 for (unsigned i = 0; i != ShuffleSize; ++i)
25186 if (Mask[i / MaskRatio] < 0)
25187 BlendMask |= 1u << i;
25189 SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL);
25190 Res = DAG.getBitcast(ShuffleVT, Input);
25191 DCI.AddToWorklist(Res.getNode());
25192 Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero,
25193 DAG.getConstant(BlendMask, DL, MVT::i8));
25194 DCI.AddToWorklist(Res.getNode());
25195 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25201 // Attempt to combine to INSERTPS.
25202 if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
25203 (VT == MVT::v2f64 || VT == MVT::v4f32)) {
25204 SmallBitVector Zeroable(4, false);
25205 for (unsigned i = 0; i != NumMaskElts; ++i)
25206 if (Mask[i] < 0)
25207 Zeroable[i] = true;
25209 unsigned InsertPSMask;
25210 SDValue V1 = Input, V2 = Input;
25211 if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
25212 Zeroable, Mask, DAG)) {
25213 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
25214 return false; // Nothing to do!
25215 V1 = DAG.getBitcast(MVT::v4f32, V1);
25216 DCI.AddToWorklist(V1.getNode());
25217 V2 = DAG.getBitcast(MVT::v4f32, V2);
25218 DCI.AddToWorklist(V2.getNode());
25219 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
25220 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25221 DCI.AddToWorklist(Res.getNode());
25222 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25228 // Don't try to re-form single instruction chains under any circumstances now
25229 // that we've done encoding canonicalization for them.
25233 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask))
25236 bool MaskContainsZeros =
25237 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
25239 // If we have a single input shuffle with different shuffle patterns in the
25240 // 128-bit lanes, use the variable mask to VPERMILPS.
25241 // TODO Combine other mask types at higher depths.
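// (Each mask element M is reduced to M % 4 below because VPERMILPS selects
// within its own 128-bit lane; e.g. a v8f32 mask <3,2,1,0,7,6,5,4> becomes
// the index vector <3,2,1,0,3,2,1,0>.)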
25242 if (HasVariableMask && !MaskContainsZeros &&
25243 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
25244 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
25245 SmallVector<SDValue, 16> VPermIdx;
25246 for (int M : Mask) {
25247 SDValue Idx =
25248 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
25249 VPermIdx.push_back(Idx);
25251 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
25252 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
25253 DCI.AddToWorklist(VPermMask.getNode());
25254 Res = DAG.getBitcast(MaskVT, Input);
25255 DCI.AddToWorklist(Res.getNode());
25256 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
25257 DCI.AddToWorklist(Res.getNode());
25258 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25263 // If we have 3 or more shuffle instructions or a chain involving a variable
25264 // mask, we can replace them with a single PSHUFB instruction profitably.
25265 // Intel's manuals suggest only using PSHUFB if doing so replacing 5
25266 // instructions, but in practice PSHUFB tends to be *very* fast so we're
25267 // more aggressive.
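// (Sketch of the byte-mask construction below: with 32-bit mask elements
// Ratio == 4, so mask element M expands to control bytes 4*M+0 .. 4*M+3,
// while SM_SentinelZero lanes become 0xFF, whose set high bit makes PSHUFB
// write a zero byte.)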
25268 if ((Depth >= 3 || HasVariableMask) &&
25269 ((VT.is128BitVector() && Subtarget.hasSSSE3()) ||
25270 (VT.is256BitVector() && Subtarget.hasAVX2()) ||
25271 (VT.is512BitVector() && Subtarget.hasBWI()))) {
25272 SmallVector<SDValue, 16> PSHUFBMask;
25273 int NumBytes = VT.getSizeInBits() / 8;
25274 int Ratio = NumBytes / NumMaskElts;
25275 for (int i = 0; i < NumBytes; ++i) {
25276 int M = Mask[i / Ratio];
25277 if (M == SM_SentinelUndef) {
25278 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
25281 if (M == SM_SentinelZero) {
25282 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
25285 M = Ratio * M + i % Ratio;
25286 assert ((M / 16) == (i / 16) && "Lane crossing detected");
25287 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
25289 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
25290 Res = DAG.getBitcast(ByteVT, Input);
25291 DCI.AddToWorklist(Res.getNode());
25292 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
25293 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
25294 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
25295 DCI.AddToWorklist(Res.getNode());
25296 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25301 // Failed to find any combines.
25305 /// \brief Fully generic combining of x86 shuffle instructions.
25307 /// This should be the last combine run over the x86 shuffle instructions. Once
25308 /// they have been fully optimized, this will recursively consider all chains
25309 /// of single-use shuffle instructions, build a generic model of the cumulative
25310 /// shuffle operation, and check for simpler instructions which implement this
25311 /// operation. We use this primarily for two purposes:
25313 /// 1) Collapse generic shuffles to specialized single instructions when
25314 /// equivalent. In most cases, this is just an encoding size win, but
25315 /// sometimes we will collapse multiple generic shuffles into a single
25316 /// special-purpose shuffle.
25317 /// 2) Look for sequences of shuffle instructions with 3 or more total
25318 /// instructions, and replace them with the slightly more expensive SSSE3
25319 /// PSHUFB instruction if available. We do this as the last combining step
25320 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
25321 /// a suitable short sequence of other instructions. The PSHUFB will either
25322 /// use a register or have to read from memory and so is slightly (but only
25323 /// slightly) more expensive than the other shuffle instructions.
25325 /// Because this is inherently a quadratic operation (for each shuffle in
25326 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
25327 /// This should never be an issue in practice as the shuffle lowering doesn't
25328 /// produce sequences of more than 8 instructions.
25330 /// FIXME: We will currently miss some cases where the redundant shuffling
25331 /// would simplify under the threshold for PSHUFB formation because of
25332 /// combine-ordering. To fix this, we should do the redundant instruction
25333 /// combining in this recursive walk.
25334 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
25335 ArrayRef<int> RootMask,
25336 int Depth, bool HasVariableMask,
25338 TargetLowering::DAGCombinerInfo &DCI,
25339 const X86Subtarget &Subtarget) {
25340 // Bound the depth of our recursive combine because this is ultimately
25341 // quadratic in nature.
25345 // Directly rip through bitcasts to find the underlying operand.
25346 while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
25347 Op = Op.getOperand(0);
25349 MVT VT = Op.getSimpleValueType();
25350 if (!VT.isVector())
25351 return false; // Bail if we hit a non-vector.
25353 assert(Root.getSimpleValueType().isVector() &&
25354 "Shuffles operate on vector types!");
25355 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
25356 "Can only combine shuffles of the same vector register size.");
25358 // Extract target shuffle mask and resolve sentinels and inputs.
25359 SDValue Input0, Input1;
25360 SmallVector<int, 16> OpMask;
25361 if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
25364 assert(VT.getVectorNumElements() == OpMask.size() &&
25365 "Different mask size from vector size!");
25366 assert(((RootMask.size() > OpMask.size() &&
25367 RootMask.size() % OpMask.size() == 0) ||
25368 (OpMask.size() > RootMask.size() &&
25369 OpMask.size() % RootMask.size() == 0) ||
25370 OpMask.size() == RootMask.size()) &&
25371 "The smaller number of elements must divide the larger.");
25372 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
25373 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
25374 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
25375 assert(((RootRatio == 1 && OpRatio == 1) ||
25376 (RootRatio == 1) != (OpRatio == 1)) &&
25377 "Must not have a ratio for both incoming and op masks!");
25379 SmallVector<int, 16> Mask;
25380 Mask.reserve(MaskWidth);
25382 // Merge this shuffle operation's mask into our accumulated mask. Note that
25383 // this shuffle's mask will be the first applied to the input, followed by the
25384 // root mask to get us all the way to the root value arrangement. The reason
25385 // for this order is that we are recursing up the operation chain.
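// E.g. composing a root mask <1,0> with an operand whose mask is <2,3,0,1>
// gives RootRatio == 2 and OpRatio == 1, and the merged 4-element mask works
// out to <0,1,2,3>: the two half-swaps cancel to the identity.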
25386 for (int i = 0; i < MaskWidth; ++i) {
25387 int RootIdx = i / RootRatio;
25388 if (RootMask[RootIdx] < 0) {
25389 // This is a zero or undef lane, we're done.
25390 Mask.push_back(RootMask[RootIdx]);
25394 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
25395 int OpIdx = RootMaskedIdx / OpRatio;
25396 if (OpMask[OpIdx] < 0) {
25397 // The incoming lanes are zero or undef, it doesn't matter which ones we
25399 Mask.push_back(OpMask[OpIdx]);
25403 // Ok, we have non-zero lanes, map them through.
25404 Mask.push_back(OpMask[OpIdx] * OpRatio +
25405 RootMaskedIdx % OpRatio);
25408 // Handle the all undef/zero cases early.
25409 if (llvm::all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
25410 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
25413 if (llvm::all_of(Mask, [](int Idx) { return Idx < 0; })) {
25414 // TODO - should we handle the mixed zero/undef case as well? Just returning
25415 // a zero mask will lose information on undef elements possibly reducing
25416 // future combine possibilities.
25417 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
25418 Subtarget, DAG, SDLoc(Root)));
25422 int MaskSize = Mask.size();
25423 bool UseInput0 = std::any_of(Mask.begin(), Mask.end(),
25424 [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
25425 bool UseInput1 = std::any_of(Mask.begin(), Mask.end(),
25426 [MaskSize](int Idx) { return MaskSize <= Idx; });
25428 // At the moment we can only combine unary shuffle mask cases.
25429 if (UseInput0 && UseInput1)
25431 else if (UseInput1) {
25432 std::swap(Input0, Input1);
25433 ShuffleVectorSDNode::commuteMask(Mask);
25436 assert(Input0 && "Shuffle with no inputs detected");
25438 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
25440 // See if we can recurse into Input0 (if it's a target shuffle).
25441 if (Op->isOnlyUserOf(Input0.getNode()) &&
25442 combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1,
25443 HasVariableMask, DAG, DCI, Subtarget))
25446 // Minor canonicalization of the accumulated shuffle mask to make it easier
25447 // to match below. All this does is detect masks with sequential pairs of
25448 // elements, and shrink them to the half-width mask. It does this in a loop
25449 // so it will reduce the size of the mask to the minimal width mask which
25450 // performs an equivalent shuffle.
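// E.g. the 4-element mask <2,3,0,1> widens to the 2-element mask <1,0>, and a
// mask that is already minimal is left unchanged.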
25451 SmallVector<int, 16> WidenedMask;
25452 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
25453 Mask = std::move(WidenedMask);
25456 return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG,
25460 /// \brief Get the PSHUF-style mask from PSHUF node.
25462 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
25463 /// PSHUF-style masks that can be reused with such instructions.
25464 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
25465 MVT VT = N.getSimpleValueType();
25466 SmallVector<int, 4> Mask;
25467 SmallVector<SDValue, 2> Ops;
25470 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
25474 // If we have more than 128-bits, only the low 128-bits of shuffle mask
25475 // matter. Check that the upper masks are repeats and remove them.
25476 if (VT.getSizeInBits() > 128) {
25477 int LaneElts = 128 / VT.getScalarSizeInBits();
25479 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
25480 for (int j = 0; j < LaneElts; ++j)
25481 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
25482 "Mask doesn't repeat in high 128-bit lanes!");
25484 Mask.resize(LaneElts);
25487 switch (N.getOpcode()) {
25488 case X86ISD::PSHUFD:
25490 case X86ISD::PSHUFLW:
25493 case X86ISD::PSHUFHW:
25494 Mask.erase(Mask.begin(), Mask.begin() + 4);
25495 for (int &M : Mask)
25499 llvm_unreachable("No valid shuffle instruction found!");
25503 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
25505 /// We walk up the chain and look for a combinable shuffle, skipping over
25506 /// shuffles that we could hoist this shuffle's transformation past without
25507 /// altering anything.
25509 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
25511 TargetLowering::DAGCombinerInfo &DCI) {
25512 assert(N.getOpcode() == X86ISD::PSHUFD &&
25513 "Called with something other than an x86 128-bit half shuffle!");
25516 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
25517 // of the shuffles in the chain so that we can form a fresh chain to replace
25519 SmallVector<SDValue, 8> Chain;
25520 SDValue V = N.getOperand(0);
25521 for (; V.hasOneUse(); V = V.getOperand(0)) {
25522 switch (V.getOpcode()) {
25524 return SDValue(); // Nothing combined!
25527 // Skip bitcasts as we always know the type for the target specific
25531 case X86ISD::PSHUFD:
25532 // Found another dword shuffle.
25535 case X86ISD::PSHUFLW:
25536 // Check that the low words (being shuffled) are the identity in the
25537 // dword shuffle, and the high words are self-contained.
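// E.g. a dword mask such as <0,1,3,2> qualifies: the low two dwords are left
// in place and the high two only permute among themselves, so this PSHUFLW
// can be skipped and rebuilt around the hoisted shuffle later.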
25538 if (Mask[0] != 0 || Mask[1] != 1 ||
25539 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
25542 Chain.push_back(V);
25545 case X86ISD::PSHUFHW:
25546 // Check that the high words (being shuffled) are the identity in the
25547 // dword shuffle, and the low words are self-contained.
25548 if (Mask[2] != 2 || Mask[3] != 3 ||
25549 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
25552 Chain.push_back(V);
25555 case X86ISD::UNPCKL:
25556 case X86ISD::UNPCKH:
25557 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
25558 // shuffle into a preceding word shuffle.
25559 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
25560 V.getSimpleValueType().getVectorElementType() != MVT::i16)
25563 // Search for a half-shuffle which we can combine with.
25564 unsigned CombineOp =
25565 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
25566 if (V.getOperand(0) != V.getOperand(1) ||
25567 !V->isOnlyUserOf(V.getOperand(0).getNode()))
25569 Chain.push_back(V);
25570 V = V.getOperand(0);
25572 switch (V.getOpcode()) {
25574 return SDValue(); // Nothing to combine.
25576 case X86ISD::PSHUFLW:
25577 case X86ISD::PSHUFHW:
25578 if (V.getOpcode() == CombineOp)
25581 Chain.push_back(V);
25585 V = V.getOperand(0);
25589 } while (V.hasOneUse());
25592 // Break out of the loop if we break out of the switch.
25596 if (!V.hasOneUse())
25597 // We fell out of the loop without finding a viable combining instruction.
25600 // Merge this node's mask and our incoming mask.
25601 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25602 for (int &M : Mask)
25604 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
25605 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
25607 // Rebuild the chain around this new shuffle.
25608 while (!Chain.empty()) {
25609 SDValue W = Chain.pop_back_val();
25611 if (V.getValueType() != W.getOperand(0).getValueType())
25612 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
25614 switch (W.getOpcode()) {
25616 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
25618 case X86ISD::UNPCKL:
25619 case X86ISD::UNPCKH:
25620 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
25623 case X86ISD::PSHUFD:
25624 case X86ISD::PSHUFLW:
25625 case X86ISD::PSHUFHW:
25626 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
25630 if (V.getValueType() != N.getValueType())
25631 V = DAG.getBitcast(N.getValueType(), V);
25633 // Return the new chain to replace N.
25634 return V;
25635 }
25637 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
25638 /// pshufhw.
25640 /// We walk up the chain, skipping shuffles of the other half and looking
25641 /// through shuffles which switch halves trying to find a shuffle of the same
25642 /// pair of dwords.
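/// Roughly: in pshuflw(pshufhw(pshuflw(X, m0), m1), m2) the intervening
/// pshufhw only touches the other half, so the outer pshuflw can be folded
/// into the inner one by composing m2 with m0.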
25643 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
25645 TargetLowering::DAGCombinerInfo &DCI) {
25646 assert(
25647 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
25648 "Called with something other than an x86 128-bit half shuffle!");
25650 unsigned CombineOpcode = N.getOpcode();
25652 // Walk up a single-use chain looking for a combinable shuffle.
25653 SDValue V = N.getOperand(0);
25654 for (; V.hasOneUse(); V = V.getOperand(0)) {
25655 switch (V.getOpcode()) {
25657 return false; // Nothing combined!
25660 // Skip bitcasts as we always know the type for the target specific
25664 case X86ISD::PSHUFLW:
25665 case X86ISD::PSHUFHW:
25666 if (V.getOpcode() == CombineOpcode)
25669 // Other-half shuffles are no-ops.
25672 // Break out of the loop if we break out of the switch.
25676 if (!V.hasOneUse())
25677 // We fell out of the loop without finding a viable combining instruction.
25680 // Combine away the bottom node as its shuffle will be accumulated into
25681 // a preceding shuffle.
25682 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
25684 // Record the old value.
25685 SDValue Old = V;
25687 // Merge this node's mask and our incoming mask (adjusted to account for all
25688 // the pshufd instructions encountered).
25689 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25690 for (int &M : Mask)
25692 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
25693 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
25695 // Check that the shuffles didn't cancel each other out. If not, we need to
25696 // combine to the new one.
25697 if (Old != V)
25698 // Replace the combinable shuffle with the combined one, updating all users
25699 // so that we re-evaluate the chain here.
25700 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
25705 /// \brief Try to combine x86 target specific shuffles.
25706 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
25707 TargetLowering::DAGCombinerInfo &DCI,
25708 const X86Subtarget &Subtarget) {
25710 MVT VT = N.getSimpleValueType();
25711 SmallVector<int, 4> Mask;
25713 switch (N.getOpcode()) {
25714 case X86ISD::PSHUFD:
25715 case X86ISD::PSHUFLW:
25716 case X86ISD::PSHUFHW:
25717 Mask = getPSHUFShuffleMask(N);
25718 assert(Mask.size() == 4);
25720 case X86ISD::UNPCKL: {
25721 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
25722 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
25723 // moves upper half elements into the lower half part. For example:
25725 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
25727 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
25729 // will be combined to:
25731 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
25733 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
25734 // happen due to advanced instructions.
25735 if (!VT.is128BitVector())
25738 auto Op0 = N.getOperand(0);
25739 auto Op1 = N.getOperand(1);
25740 if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
25741 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
25743 unsigned NumElts = VT.getVectorNumElements();
25744 SmallVector<int, 8> ExpectedMask(NumElts, -1);
25745 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
25748 auto ShufOp = Op1.getOperand(0);
25749 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
25750 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
25754 case X86ISD::BLENDI: {
25755 SDValue V0 = N->getOperand(0);
25756 SDValue V1 = N->getOperand(1);
25757 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
25758 "Unexpected input vector types");
25760 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
25761 // operands and changing the mask to 1. This saves us a bunch of
25762 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
25763 // x86InstrInfo knows how to commute this back after instruction selection
25764 // if it would help register allocation.
25766 // TODO: If optimizing for size or a processor that doesn't suffer from
25767 // partial register update stalls, this should be transformed into a MOVSD
25768 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
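// E.g. blendi(V0, V1, 2) takes element 0 from V0 and element 1 from V1;
// blendi(V1, V0, 1) selects exactly the same elements, so swapping the
// operands and using mask 1 preserves the value.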
25770 if (VT == MVT::v2f64)
25771 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
25772 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
25773 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
25774 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
25777 // Attempt to merge blend(insertps(x,y),zero).
25778 if (V0.getOpcode() == X86ISD::INSERTPS ||
25779 V1.getOpcode() == X86ISD::INSERTPS) {
25780 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
25782 // Determine which elements are known to be zero.
25783 SmallVector<int, 8> TargetMask;
25784 SmallVector<SDValue, 2> BlendOps;
25785 if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps))
25788 // Helper function to take inner insertps node and attempt to
25789 // merge the blend with zero into its zero mask.
25790 auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
25791 if (V.getOpcode() != X86ISD::INSERTPS)
25793 SDValue Op0 = V.getOperand(0);
25794 SDValue Op1 = V.getOperand(1);
25795 SDValue Op2 = V.getOperand(2);
25796 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
25798 // Check each element of the blend node's target mask - must either
25799 // be zeroable (and update the zero mask) or selects the element from
25800 // the inner insertps node.
25801 for (int i = 0; i != 4; ++i)
25802 if (TargetMask[i] < 0)
25803 InsertPSMask |= (1u << i);
25804 else if (TargetMask[i] != (i + Offset))
25806 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
25807 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25810 if (SDValue V = MergeInsertPSAndBlend(V0, 0))
25812 if (SDValue V = MergeInsertPSAndBlend(V1, 4))
25817 case X86ISD::INSERTPS: {
25818 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
25819 SDValue Op0 = N.getOperand(0);
25820 SDValue Op1 = N.getOperand(1);
25821 SDValue Op2 = N.getOperand(2);
25822 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
25823 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
25824 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
25825 unsigned ZeroMask = InsertPSMask & 0xF;
25827 // If we zero out all elements from Op0 then we don't need to reference it.
25828 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
25829 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
25830 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25832 // If we zero out the element from Op1 then we don't need to reference it.
25833 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
25834 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
25835 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25837 // Attempt to merge insertps Op1 with an inner target shuffle node.
25838 SmallVector<int, 8> TargetMask1;
25839 SmallVector<SDValue, 2> Ops1;
25840 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
25841 int M = TargetMask1[SrcIdx];
25842 if (isUndefOrZero(M)) {
25843 // Zero/UNDEF insertion - zero out element and remove dependency.
25844 InsertPSMask |= (1u << DstIdx);
25845 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
25846 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25848 // Update insertps mask srcidx and reference the source input directly.
25849 assert(0 <= M && M < 8 && "Shuffle index out of range");
25850 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
25851 Op1 = Ops1[M < 4 ? 0 : 1];
25852 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
25853 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25856 // Attempt to merge insertps Op0 with an inner target shuffle node.
25857 SmallVector<int, 8> TargetMask0;
25858 SmallVector<SDValue, 2> Ops0;
25859 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
25862 bool Updated = false;
25863 bool UseInput00 = false;
25864 bool UseInput01 = false;
25865 for (int i = 0; i != 4; ++i) {
25866 int M = TargetMask0[i];
25867 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
25868 // No change if element is already zero or the inserted element.
25870 } else if (isUndefOrZero(M)) {
25871 // If the target mask is undef/zero then we must zero the element.
25872 InsertPSMask |= (1u << i);
25877 // The input vector element must be inline.
25878 if (M != i && M != (i + 4))
25881 // Determine which inputs of the target shuffle we're using.
25882 UseInput00 |= (0 <= M && M < 4);
25883 UseInput01 |= (4 <= M);
25886 // If we're not using both inputs of the target shuffle then use the
25887 // referenced input directly.
25888 if (UseInput00 && !UseInput01) {
25891 } else if (!UseInput00 && UseInput01) {
25897 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
25898 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25906 // Nuke no-op shuffles that show up after combining.
25907 if (isNoopShuffleMask(Mask))
25908 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
25910 // Look for simplifications involving one or two shuffle instructions.
25911 SDValue V = N.getOperand(0);
25912 switch (N.getOpcode()) {
25915 case X86ISD::PSHUFLW:
25916 case X86ISD::PSHUFHW:
25917 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
25919 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
25920 return SDValue(); // We combined away this shuffle, so we're done.
25922 // See if this reduces to a PSHUFD which is no more expensive and can
25923 // combine with more operations. Note that it has to at least flip the
25924 // dwords as otherwise it would have been removed as a no-op.
25925 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
25926 int DMask[] = {0, 1, 2, 3};
25927 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
25928 DMask[DOffset + 0] = DOffset + 1;
25929 DMask[DOffset + 1] = DOffset + 0;
25930 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
25931 V = DAG.getBitcast(DVT, V);
25932 DCI.AddToWorklist(V.getNode());
25933 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
25934 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
25935 DCI.AddToWorklist(V.getNode());
25936 return DAG.getBitcast(VT, V);
25939 // Look for shuffle patterns which can be implemented as a single unpack.
25940 // FIXME: This doesn't handle the location of the PSHUFD generically, and
25941 // only works when we have a PSHUFD followed by two half-shuffles.
25942 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
25943 (V.getOpcode() == X86ISD::PSHUFLW ||
25944 V.getOpcode() == X86ISD::PSHUFHW) &&
25945 V.getOpcode() != N.getOpcode() &&
25947 SDValue D = V.getOperand(0);
25948 while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
25949 D = D.getOperand(0);
25950 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
25951 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25952 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
25953 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
25954 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
25955 int WordMask[8];
25956 for (int i = 0; i < 4; ++i) {
25957 WordMask[i + NOffset] = Mask[i] + NOffset;
25958 WordMask[i + VOffset] = VMask[i] + VOffset;
25960 // Map the word mask through the DWord mask.
25961 int MappedMask[8];
25962 for (int i = 0; i < 8; ++i)
25963 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
25964 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
25965 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
25966 // We can replace all three shuffles with an unpack.
25967 V = DAG.getBitcast(VT, D.getOperand(0));
25968 DCI.AddToWorklist(V.getNode());
25969 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
25978 case X86ISD::PSHUFD:
25979 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
25988 /// \brief Try to combine a shuffle into a target-specific add-sub node.
25990 /// We combine this directly on the abstract vector shuffle nodes so it is
25991 /// easier to generically match. We also insert dummy vector shuffle nodes for
25992 /// the operands which explicitly discard the lanes which are unused by this
25993 /// operation to try to flow through the rest of the combiner the fact that
25994 /// they're unused.
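/// For example, for v4f32 a shuffle<0,5,2,7> of (fsub(A,B), fadd(A,B))
/// computes <A0-B0, A1+B1, A2-B2, A3+B3>, which is X86ISD::ADDSUB of (A, B)
/// (subtract in the even lanes, add in the odd lanes).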
25995 static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
25996 SelectionDAG &DAG) {
25998 EVT VT = N->getValueType(0);
25999 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
26000 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
26003 // We only handle target-independent shuffles.
26004 // FIXME: It would be easy and harmless to use the target shuffle mask
26005 // extraction tool to support more.
26006 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
26009 auto *SVN = cast<ShuffleVectorSDNode>(N);
26010 SmallVector<int, 8> Mask;
26011 for (int M : SVN->getMask())
26014 SDValue V1 = N->getOperand(0);
26015 SDValue V2 = N->getOperand(1);
26017 // We require the first shuffle operand to be the FSUB node, and the second to
26018 // be the FADD node.
26019 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
26020 ShuffleVectorSDNode::commuteMask(Mask);
26022 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
26025 // If there are other uses of these operations we can't fold them.
26026 if (!V1->hasOneUse() || !V2->hasOneUse())
26029 // Ensure that both operations have the same operands. Note that we can
26030 // commute the FADD operands.
26031 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
26032 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
26033 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
26036 // We're looking for blends between FADD and FSUB nodes. We insist on these
26037 // nodes being lined up in a specific expected pattern.
26038 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
26039 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
26040 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
26043 return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
26046 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
26047 TargetLowering::DAGCombinerInfo &DCI,
26048 const X86Subtarget &Subtarget) {
26050 EVT VT = N->getValueType(0);
26052 // Don't create instructions with illegal types after legalize types has run.
26053 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26054 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
26057 // If we have legalized the vector types, look for blends of FADD and FSUB
26058 // nodes that we can fuse into an ADDSUB node.
26059 if (TLI.isTypeLegal(VT))
26060 if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
26063 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
26064 if (TLI.isTypeLegal(VT) && Subtarget.hasFp256() && VT.is256BitVector() &&
26065 N->getOpcode() == ISD::VECTOR_SHUFFLE)
26066 return combineShuffle256(N, DAG, DCI, Subtarget);
26068 // During Type Legalization, when promoting illegal vector types,
26069 // the backend might introduce new shuffle dag nodes and bitcasts.
26071 // This code performs the following transformation:
26072 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
26073 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
26075 // We do this only if both the bitcast and the BINOP dag nodes have
26076 // one use. Also, perform this transformation only if the new binary
26077 // operation is legal. This is to avoid introducing dag nodes that
26078 // potentially need to be further expanded (or custom lowered) into a
26079 // less optimal sequence of dag nodes.
26080 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
26081 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
26082 N->getOperand(0).getOpcode() == ISD::BITCAST &&
26083 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
26084 SDValue N0 = N->getOperand(0);
26085 SDValue N1 = N->getOperand(1);
26087 SDValue BC0 = N0.getOperand(0);
26088 EVT SVT = BC0.getValueType();
26089 unsigned Opcode = BC0.getOpcode();
26090 unsigned NumElts = VT.getVectorNumElements();
26092 if (BC0.hasOneUse() && SVT.isVector() &&
26093 SVT.getVectorNumElements() * 2 == NumElts &&
26094 TLI.isOperationLegal(Opcode, VT)) {
26095 bool CanFold = false;
26107 unsigned SVTNumElts = SVT.getVectorNumElements();
26108 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
26109 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
26110 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
26111 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
26112 CanFold = SVOp->getMaskElt(i) < 0;
26115 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
26116 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
26117 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
26118 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
26123 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
26124 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
26125 // consecutive, non-overlapping, and in the right order.
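// E.g. a v4i32 shuffle whose lanes resolve, in order, to four i32 loads from
// p, p+4, p+8 and p+12 can be rewritten as a single 128-bit load from p
// (assuming EltsFromConsecutiveLoads proves the loads are mergeable).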
26126 SmallVector<SDValue, 16> Elts;
26127 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
26128 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
26130 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
26133 if (isTargetShuffle(N->getOpcode())) {
26134 if (SDValue Shuffle =
26135 combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget))
26138 // Try recursively combining arbitrary sequences of x86 shuffle
26139 // instructions into higher-order shuffles. We do this after combining
26140 // specific PSHUF instruction sequences into their minimal form so that we
26141 // can evaluate how many specialized shuffle instructions are involved in
26142 // a particular chain.
26143 SmallVector<int, 1> NonceMask; // Just a placeholder.
26144 NonceMask.push_back(0);
26145 if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
26146 /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
26148 return SDValue(); // This routine will use CombineTo to replace N.
26154 /// Check if a vector extract from a target-specific shuffle of a load can be
26155 /// folded into a single element load.
26156 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
26157 /// shuffles have been custom lowered so we need to handle those here.
26158 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
26159 TargetLowering::DAGCombinerInfo &DCI) {
26160 if (DCI.isBeforeLegalizeOps())
26163 SDValue InVec = N->getOperand(0);
26164 SDValue EltNo = N->getOperand(1);
26165 EVT EltVT = N->getValueType(0);
26167 if (!isa<ConstantSDNode>(EltNo))
26170 EVT OriginalVT = InVec.getValueType();
26172 if (InVec.getOpcode() == ISD::BITCAST) {
26173 // Don't duplicate a load with other uses.
26174 if (!InVec.hasOneUse())
26176 EVT BCVT = InVec.getOperand(0).getValueType();
26177 if (!BCVT.isVector() ||
26178 BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
26180 InVec = InVec.getOperand(0);
26183 EVT CurrentVT = InVec.getValueType();
26185 if (!isTargetShuffle(InVec.getOpcode()))
26188 // Don't duplicate a load with other uses.
26189 if (!InVec.hasOneUse())
26192 SmallVector<int, 16> ShuffleMask;
26193 SmallVector<SDValue, 2> ShuffleOps;
26195 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
26196 ShuffleOps, ShuffleMask, UnaryShuffle))
26199 // Select the input vector, guarding against an out-of-range extract index.
26200 unsigned NumElems = CurrentVT.getVectorNumElements();
26201 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
26202 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
26204 if (Idx == SM_SentinelZero)
26205 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
26206 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
26207 if (Idx == SM_SentinelUndef)
26208 return DAG.getUNDEF(EltVT);
26210 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
26211 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
26214 // If inputs to shuffle are the same for both ops, then allow 2 uses
26215 unsigned AllowedUses =
26216 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
26218 if (LdNode.getOpcode() == ISD::BITCAST) {
26219 // Don't duplicate a load with other uses.
26220 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
26223 AllowedUses = 1; // only allow 1 load use if we have a bitcast
26224 LdNode = LdNode.getOperand(0);
26227 if (!ISD::isNormalLoad(LdNode.getNode()))
26230 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
26232 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
26235 // If there's a bitcast before the shuffle, check if the load type and
26236 // alignment is valid.
26237 unsigned Align = LN0->getAlignment();
26238 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26239 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
26240 EltVT.getTypeForEVT(*DAG.getContext()));
26242 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
26245 // All checks match so transform back to vector_shuffle so that DAG combiner
26246 // can finish the job
26249 // Create a shuffle node, taking into account the case that it's a unary shuffle.
26250 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
26251 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
26253 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
26254 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
26258 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
26259 const X86Subtarget &Subtarget) {
26260 SDValue N0 = N->getOperand(0);
26261 EVT VT = N->getValueType(0);
26263 // Detect bitcasts from i32 to the x86mmx low word. Since MMX types are
26264 // special and don't usually play with other vector types, it's better to
26265 // handle them early to be sure we emit efficient code by avoiding
26266 // store-load conversions.
26267 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
26268 N0.getValueType() == MVT::v2i32 &&
26269 isNullConstant(N0.getOperand(1))) {
26270 SDValue N00 = N0->getOperand(0);
26271 if (N00.getValueType() == MVT::i32)
26272 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
26275 // Convert a bitcasted integer logic operation that has one bitcasted
26276 // floating-point operand and one constant operand into a floating-point
26277 // logic operation. This may create a load of the constant, but that is
26278 // cheaper than materializing the constant in an integer register and
26279 // transferring it to an SSE register or transferring the SSE operand to
26280 // integer register and back.
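// A typical instance of this fold (sketching the fabs-style sign mask on f32):
//   (f32 bitcast (and (i32 bitcast X), 0x7fffffff))
// becomes
//   (f32 FAND X, (f32 bitcast 0x7fffffff))
// so the mask is applied in the SSE domain instead of bouncing the value
// through an integer register.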
26282 switch (N0.getOpcode()) {
26283 case ISD::AND: FPOpcode = X86ISD::FAND; break;
26284 case ISD::OR: FPOpcode = X86ISD::FOR; break;
26285 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
26286 default: return SDValue();
26288 if (((Subtarget.hasSSE1() && VT == MVT::f32) ||
26289 (Subtarget.hasSSE2() && VT == MVT::f64)) &&
26290 isa<ConstantSDNode>(N0.getOperand(1)) &&
26291 N0.getOperand(0).getOpcode() == ISD::BITCAST &&
26292 N0.getOperand(0).getOperand(0).getValueType() == VT) {
26293 SDValue N000 = N0.getOperand(0).getOperand(0);
26294 SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
26295 return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
26301 /// Detect vector gather/scatter index generation and convert it from being a
26302 /// bunch of shuffles and extracts into a somewhat faster sequence.
26303 /// For i686, the best sequence is apparently storing the value and loading
26304 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
26305 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
26306 TargetLowering::DAGCombinerInfo &DCI) {
26307 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
26310 SDValue InputVector = N->getOperand(0);
26311 SDLoc dl(InputVector);
26312 // Detect mmx to i32 conversion through a v2i32 elt extract.
26313 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
26314 N->getValueType(0) == MVT::i32 &&
26315 InputVector.getValueType() == MVT::v2i32 &&
26316 isa<ConstantSDNode>(N->getOperand(1)) &&
26317 N->getConstantOperandVal(1) == 0) {
26318 SDValue MMXSrc = InputVector.getNode()->getOperand(0);
26320 // The bitcast source is a direct mmx result.
26321 if (MMXSrc.getValueType() == MVT::x86mmx)
26322 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
26325 EVT VT = N->getValueType(0);
26327 if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
26328 InputVector.getOpcode() == ISD::BITCAST &&
26329 isa<ConstantSDNode>(InputVector.getOperand(0))) {
26330 uint64_t ExtractedElt =
26331 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
26332 uint64_t InputValue =
26333 cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
26334 uint64_t Res = (InputValue >> ExtractedElt) & 1;
26335 return DAG.getConstant(Res, dl, MVT::i1);
26337 // Only operate on vectors of 4 elements, where the alternative shuffling
26338 // gets to be more expensive.
26339 if (InputVector.getValueType() != MVT::v4i32)
26342 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
26343 // single use which is a sign-extend or zero-extend, and all elements are
26344 // used.
26345 SmallVector<SDNode *, 4> Uses;
26346 unsigned ExtractedElements = 0;
26347 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
26348 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
26349 if (UI.getUse().getResNo() != InputVector.getResNo())
26352 SDNode *Extract = *UI;
26353 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
26356 if (Extract->getValueType(0) != MVT::i32)
26358 if (!Extract->hasOneUse())
26360 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
26361 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
26363 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
26366 // Record which element was extracted.
26367 ExtractedElements |=
26368 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
26370 Uses.push_back(Extract);
26373 // If not all the elements were used, this may not be worthwhile.
26374 if (ExtractedElements != 15)
26377 // Ok, we've now decided to do the transformation.
26378 // If 64-bit shifts are legal, use the extract-shift sequence,
26379 // otherwise bounce the vector off the cache.
26380 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26383 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
26384 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
26385 auto &DL = DAG.getDataLayout();
26386 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
26387 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
26388 DAG.getConstant(0, dl, VecIdxTy));
26389 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
26390 DAG.getConstant(1, dl, VecIdxTy));
26392 SDValue ShAmt = DAG.getConstant(
26393 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
26394 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
26395 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
26396 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
26397 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
26398 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
26399 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
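// Scalar model of the extract+shift sequence just built (little-endian
// lanes); SRA vs. SRL makes no difference here since each result is
// truncated to 32 bits:
//   uint64_t Lo = BottomHalf, Hi = TopHalf;
//   uint32_t E0 = (uint32_t)Lo;          // element 0
//   uint32_t E1 = (uint32_t)(Lo >> 32);  // element 1
//   uint32_t E2 = (uint32_t)Hi;          // element 2
//   uint32_t E3 = (uint32_t)(Hi >> 32);  // element 3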
26401 // Store the value to a temporary stack slot.
26402 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
26403 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
26404 MachinePointerInfo());
26406 EVT ElementType = InputVector.getValueType().getVectorElementType();
26407 unsigned EltSize = ElementType.getSizeInBits() / 8;
26409 // Replace each use (extract) with a load of the appropriate element.
26410 for (unsigned i = 0; i < 4; ++i) {
26411 uint64_t Offset = EltSize * i;
26412 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26413 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
26415 SDValue ScalarAddr =
26416 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
26418 // Load the scalar.
26420 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
26424 // Replace the extracts
26425 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
26426 UE = Uses.end(); UI != UE; ++UI) {
26427 SDNode *Extract = *UI;
26429 SDValue Idx = Extract->getOperand(1);
26430 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
26431 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
26434 // The replacement was made in place; don't return anything.
26438 /// Do target-specific dag combines on SELECT and VSELECT nodes.
26439 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
26440 TargetLowering::DAGCombinerInfo &DCI,
26441 const X86Subtarget &Subtarget) {
26443 SDValue Cond = N->getOperand(0);
26444 // Get the LHS/RHS of the select.
26445 SDValue LHS = N->getOperand(1);
26446 SDValue RHS = N->getOperand(2);
26447 EVT VT = LHS.getValueType();
26448 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26450 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
26451 // instructions match the semantics of the common C idiom x<y?x:y but not
26452 // x<=y?x:y, because of how they handle negative zero (which can be
26453 // ignored in unsafe-math mode).
26454 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
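// As a rough scalar sketch of why only the strict form maps to SSE
// (fmin_sse is just an illustrative name):
//   float fmin_sse(float a, float b) { return a < b ? a : b; } // == MINSS a, b
// MINSS returns its second operand when either input is NaN and for
// (-0.0, +0.0), which matches the strict '<' select exactly; the '<=' forms
// need the extra NaN/zero checks performed below.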
26455 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
26456 VT != MVT::f80 && VT != MVT::f128 &&
26457 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
26458 (Subtarget.hasSSE2() ||
26459 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
26460 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26462 unsigned Opcode = 0;
26463 // Check for x CC y ? x : y.
26464 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
26465 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
26469 // Converting this to a min would handle NaNs incorrectly, and swapping
26470 // the operands would cause it to handle comparisons between positive
26471 // and negative zero incorrectly.
26472 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
26473 if (!DAG.getTarget().Options.UnsafeFPMath &&
26474 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
26476 std::swap(LHS, RHS);
26478 Opcode = X86ISD::FMIN;
26481 // Converting this to a min would handle comparisons between positive
26482 // and negative zero incorrectly.
26483 if (!DAG.getTarget().Options.UnsafeFPMath &&
26484 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
26486 Opcode = X86ISD::FMIN;
26489 // Converting this to a min would handle both negative zeros and NaNs
26490 // incorrectly, but we can swap the operands to fix both.
26491 std::swap(LHS, RHS);
26495 Opcode = X86ISD::FMIN;
26499 // Converting this to a max would handle comparisons between positive
26500 // and negative zero incorrectly.
26501 if (!DAG.getTarget().Options.UnsafeFPMath &&
26502 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
26504 Opcode = X86ISD::FMAX;
26507 // Converting this to a max would handle NaNs incorrectly, and swapping
26508 // the operands would cause it to handle comparisons between positive
26509 // and negative zero incorrectly.
26510 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
26511 if (!DAG.getTarget().Options.UnsafeFPMath &&
26512 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
26514 std::swap(LHS, RHS);
26516 Opcode = X86ISD::FMAX;
26519 // Converting this to a max would handle both negative zeros and NaNs
26520 // incorrectly, but we can swap the operands to fix both.
26521 std::swap(LHS, RHS);
26525 Opcode = X86ISD::FMAX;
26528 // Check for x CC y ? y : x -- a min/max with reversed arms.
26529 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
26530 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
26534 // Converting this to a min would handle comparisons between positive
26535 // and negative zero incorrectly, and swapping the operands would
26536 // cause it to handle NaNs incorrectly.
26537 if (!DAG.getTarget().Options.UnsafeFPMath &&
26538 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
26539 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26541 std::swap(LHS, RHS);
26543 Opcode = X86ISD::FMIN;
26546 // Converting this to a min would handle NaNs incorrectly.
26547 if (!DAG.getTarget().Options.UnsafeFPMath &&
26548 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
26550 Opcode = X86ISD::FMIN;
26553 // Converting this to a min would handle both negative zeros and NaNs
26554 // incorrectly, but we can swap the operands to fix both.
26555 std::swap(LHS, RHS);
26559 Opcode = X86ISD::FMIN;
26563 // Converting this to a max would handle NaNs incorrectly.
26564 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26566 Opcode = X86ISD::FMAX;
26569 // Converting this to a max would handle comparisons between positive
26570 // and negative zero incorrectly, and swapping the operands would
26571 // cause it to handle NaNs incorrectly.
26572 if (!DAG.getTarget().Options.UnsafeFPMath &&
26573 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
26574 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26576 std::swap(LHS, RHS);
26578 Opcode = X86ISD::FMAX;
26581 // Converting this to a max would handle both negative zeros and NaNs
26582 // incorrectly, but we can swap the operands to fix both.
26583 std::swap(LHS, RHS);
26587 Opcode = X86ISD::FMAX;
26593 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
26596 EVT CondVT = Cond.getValueType();
26597 if (Subtarget.hasAVX512() && VT.isVector() && CondVT.isVector() &&
26598 CondVT.getVectorElementType() == MVT::i1) {
26599 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
26600 // lowering on KNL. In this case we convert it to
26601 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
26602 // The same applies to all 128-bit and 256-bit vectors of i8 and i16.
26603 // On SKX and later these selects have a proper lowering.
26604 EVT OpVT = LHS.getValueType();
26605 if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
26606 (OpVT.getVectorElementType() == MVT::i8 ||
26607 OpVT.getVectorElementType() == MVT::i16) &&
26608 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
26609 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
26610 DCI.AddToWorklist(Cond.getNode());
26611 return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
26614 // If this is a select between two integer constants, try to do some
26615 // simplifications.
26616 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
26617 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
26618 // Don't do this for crazy integer types.
26619 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
26620 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
26621 // so that TrueC (the true value) is larger than FalseC.
26622 bool NeedsCondInvert = false;
26624 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
26625 // Efficiently invertible.
26626 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
26627 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
26628 isa<ConstantSDNode>(Cond.getOperand(1))))) {
26629 NeedsCondInvert = true;
26630 std::swap(TrueC, FalseC);
26633 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
26634 if (FalseC->getAPIntValue() == 0 &&
26635 TrueC->getAPIntValue().isPowerOf2()) {
26636 if (NeedsCondInvert) // Invert the condition if needed.
26637 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26638 DAG.getConstant(1, DL, Cond.getValueType()));
26640 // Zero extend the condition if needed.
26641 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
26643 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
26644 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
26645 DAG.getConstant(ShAmt, DL, MVT::i8));
26648 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
26649 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
26650 if (NeedsCondInvert) // Invert the condition if needed.
26651 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26652 DAG.getConstant(1, DL, Cond.getValueType()));
26654 // Zero extend the condition if needed.
26655 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
26656 FalseC->getValueType(0), Cond);
26657 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
26658 SDValue(FalseC, 0));
26661 // Optimize cases that will turn into an LEA instruction. This requires
26662 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
26663 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
26664 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
26665 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
26667 bool isFastMultiplier = false;
26669 switch ((unsigned char)Diff) {
26671 case 1: // result = add base, cond
26672 case 2: // result = lea base( , cond*2)
26673 case 3: // result = lea base(cond, cond*2)
26674 case 4: // result = lea base( , cond*4)
26675 case 5: // result = lea base(cond, cond*4)
26676 case 8: // result = lea base( , cond*8)
26677 case 9: // result = lea base(cond, cond*8)
26678 isFastMultiplier = true;
26683 if (isFastMultiplier) {
26684 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
26685 if (NeedsCondInvert) // Invert the condition if needed.
26686 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26687 DAG.getConstant(1, DL, Cond.getValueType()));
26689 // Zero extend the condition if needed.
26690 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
26692 // Scale the condition by the difference.
26694 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
26695 DAG.getConstant(Diff, DL,
26696 Cond.getValueType()));
26698 // Add the base if non-zero.
26699 if (FalseC->getAPIntValue() != 0)
26700 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
26701 SDValue(FalseC, 0));
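// For example, the LEA case just handled: (i32 select Cond, 14, 5) has
// Diff == 9 and is built as
//   t = zext(Cond);   // 0 or 1
//   t = t * 9;        // one LEA: lea (t, t*8)
//   result = t + 5;   // 5 or 14
// instead of a conditional move between two materialized constants.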
26708 // Canonicalize max and min:
26709 // (x > y) ? x : y -> (x >= y) ? x : y
26710 // (x < y) ? x : y -> (x <= y) ? x : y
26711 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
26712 // the need for an extra compare
26713 // against zero. e.g.
26714 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
26716 // testl %edi, %edi
26718 // cmovgl %edi, %eax
26722 // cmovsl %eax, %edi
26723 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
26724 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
26725 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
26726 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26731 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
26732 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
26733 Cond.getOperand(0), Cond.getOperand(1), NewCC);
26734 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
26739 // Early exit check
26740 if (!TLI.isTypeLegal(VT))
26743 // Match VSELECTs into subs with unsigned saturation.
26744 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
26745 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
26746 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
26747 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
26748 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26750 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
26751 // left side invert the predicate to simplify logic below.
26753 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
26755 CC = ISD::getSetCCInverse(CC, true);
26756 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
26760 if (Other.getNode() && Other->getNumOperands() == 2 &&
26761 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
26762 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
26763 SDValue CondRHS = Cond->getOperand(1);
26765 // Look for a general sub with unsigned saturation first.
26766 // x >= y ? x-y : 0 --> subus x, y
26767 // x > y ? x-y : 0 --> subus x, y
26768 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
26769 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
26770 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
26772 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
26773 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
26774 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
26775 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
26776 // If the RHS is a constant we have to reverse the const
26777 // canonicalization.
26778 // x > C-1 ? x+-C : 0 --> subus x, C
26779 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
26780 CondRHSConst->getAPIntValue() ==
26781 (-OpRHSConst->getAPIntValue() - 1))
26782 return DAG.getNode(
26783 X86ISD::SUBUS, DL, VT, OpLHS,
26784 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
26786 // Another special case: If C was a sign bit, the sub has been
26787 // canonicalized into a xor.
26788 // FIXME: Would it be better to use computeKnownBits to determine
26789 // whether it's safe to decanonicalize the xor?
26790 // x s< 0 ? x^C : 0 --> subus x, C
26791 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
26792 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
26793 OpRHSConst->getAPIntValue().isSignBit())
26794 // Note that we have to rebuild the RHS constant here to ensure we
26795 // don't rely on particular values of undef lanes.
26796 return DAG.getNode(
26797 X86ISD::SUBUS, DL, VT, OpLHS,
26798 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
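// Scalar model of the saturating subtract formed above, for i8 lanes
// (subus is just an illustrative name):
//   uint8_t subus(uint8_t x, uint8_t y) { return x > y ? x - y : 0; }
// and of the two decanonicalized forms handled above:
//   x >u C-1 ? x + (-C) : 0   ==  subus(x, C)
//   x <s 0   ? x ^ 0x80 : 0   ==  subus(x, 0x80)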
26803 // Simplify vector selection if condition value type matches vselect
26804 // operand type.
26805 if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
26806 assert(Cond.getValueType().isVector() &&
26807 "vector select expects a vector selector!");
26809 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
26810 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
26812 // Try to invert the condition if the true value is not all 1s and the
26813 // false value is not all 0s.
26814 if (!TValIsAllOnes && !FValIsAllZeros &&
26815 // Check if the selector will be produced by CMPP*/PCMP*
26816 Cond.getOpcode() == ISD::SETCC &&
26817 // Check if SETCC has already been promoted
26818 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
26820 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
26821 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
26823 if (TValIsAllZeros || FValIsAllOnes) {
26824 SDValue CC = Cond.getOperand(2);
26825 ISD::CondCode NewCC =
26826 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
26827 Cond.getOperand(0).getValueType().isInteger());
26828 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
26829 std::swap(LHS, RHS);
26830 TValIsAllOnes = FValIsAllOnes;
26831 FValIsAllZeros = TValIsAllZeros;
26835 if (TValIsAllOnes || FValIsAllZeros) {
26838 if (TValIsAllOnes && FValIsAllZeros)
26840 else if (TValIsAllOnes)
26842 DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
26843 else if (FValIsAllZeros)
26844 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
26845 DAG.getBitcast(CondVT, LHS));
26847 return DAG.getBitcast(VT, Ret);
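// Per-lane model of the rewrite above (each mask lane is all-ones or all-zero):
//   select(m, ~0, f)  ==  m | f
//   select(m, t,  0)  ==  m & t
//   select(m, ~0, 0)  ==  m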
26851 // If this is a *dynamic* select (non-constant condition) and we can match
26852 // this node with one of the variable blend instructions, restructure the
26853 // condition so that the blends can use the high bit of each element and use
26854 // SimplifyDemandedBits to simplify the condition operand.
26855 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
26856 !DCI.isBeforeLegalize() &&
26857 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
26858 unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();
26860 // Don't optimize vector selects that map to mask-registers.
26864 // We can only handle the cases where VSELECT is directly legal on the
26865 // subtarget. We custom lower VSELECT nodes with constant conditions and
26866 // this makes it hard to see whether a dynamic VSELECT will correctly
26867 // lower, so we both check the operation's status and explicitly handle the
26868 // cases where a *dynamic* blend will fail even though a constant-condition
26869 // blend could be custom lowered.
26870 // FIXME: We should find a better way to handle this class of problems.
26871 // Potentially, we should combine constant-condition vselect nodes
26872 // pre-legalization into shuffles and not mark as many types as custom
26874 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
26876 // FIXME: We don't support i16-element blends currently. We could and
26877 // should support them by making *all* the bits in the condition be set
26878 // rather than just the high bit and using an i8-element blend.
26879 if (VT.getVectorElementType() == MVT::i16)
26881 // Dynamic blending was only available from SSE4.1 onward.
26882 if (VT.is128BitVector() && !Subtarget.hasSSE41())
26884 // Byte blends are only available in AVX2
26885 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
26888 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
26889 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
26891 APInt KnownZero, KnownOne;
26892 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
26893 DCI.isBeforeLegalizeOps());
26894 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
26895 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
26897 // If we changed the computation somewhere in the DAG, this change
26898 // will affect all users of Cond.
26899 // Make sure it is fine and update all the nodes so that we do not
26900 // use the generic VSELECT anymore. Otherwise, we may perform
26901 // wrong optimizations as we messed up with the actual expectation
26902 // for the vector boolean values.
26903 if (Cond != TLO.Old) {
26904 // Check all uses of that condition operand to check whether it will be
26905 // consumed by non-BLEND instructions, which may depend on all bits being
26906 // set properly.
26907 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
26909 if (I->getOpcode() != ISD::VSELECT)
26910 // TODO: Add other opcodes eventually lowered into BLEND.
26913 // Update all the users of the condition, before committing the change,
26914 // so that the VSELECT optimizations that expect the correct vector
26915 // boolean value will not be triggered.
26916 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
26918 DAG.ReplaceAllUsesOfValueWith(
26920 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
26921 Cond, I->getOperand(1), I->getOperand(2)));
26922 DCI.CommitTargetLoweringOpt(TLO);
26925 // At this point, only Cond is changed. Change the condition
26926 // just for N to keep the opportunity to optimize all other
26927 // users their own way.
26928 DAG.ReplaceAllUsesOfValueWith(
26930 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
26931 TLO.New, N->getOperand(1), N->getOperand(2)));
26939 /// Combine:
26940 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
26941 /// to:
26942 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
26943 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
26944 /// Note that this is only legal for some op/cc combinations.
26945 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
26946 SelectionDAG &DAG) {
26947 // This combine only operates on CMP-like nodes.
26948 if (!(Cmp.getOpcode() == X86ISD::CMP ||
26949 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
26952 // This only applies to variations of the common case:
26953 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
26954 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
26955 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
26956 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
26957 // Using the proper condcodes (see below), overflow is checked for.
26959 // FIXME: We can generalize both constraints:
26960 // - XOR/OR/AND (if they were made to survive AtomicExpand)
26962 // if the result is compared.
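// For example, with an increment of 1 and a signed "less than zero" test:
//   old = atomic_load_add(p, 1); use(old < 0)
// becomes a test of the LOCK ADD's own flags: old < 0 iff old + 1 <= 0, and
// because COND_LE consults OF as well as SF/ZF, the old == INT_MAX overflow
// case still gives the right answer.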
26964 SDValue CmpLHS = Cmp.getOperand(0);
26965 SDValue CmpRHS = Cmp.getOperand(1);
26967 if (!CmpLHS.hasOneUse())
26970 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
26971 if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
26974 const unsigned Opc = CmpLHS.getOpcode();
26976 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
26979 SDValue OpRHS = CmpLHS.getOperand(2);
26980 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
26984 APInt Addend = OpRHSC->getAPIntValue();
26985 if (Opc == ISD::ATOMIC_LOAD_SUB)
26988 if (CC == X86::COND_S && Addend == 1)
26990 else if (CC == X86::COND_NS && Addend == 1)
26992 else if (CC == X86::COND_G && Addend == -1)
26994 else if (CC == X86::COND_LE && Addend == -1)
26999 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
27000 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
27001 DAG.getUNDEF(CmpLHS.getValueType()));
27002 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
27006 // Check whether a boolean test is testing a boolean value generated by
27007 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
27008 // flag.
27010 // Simplify the following patterns:
27011 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
27012 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
27013 // to (Op EFLAGS Cond)
27015 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
27016 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
27017 // to (Op EFLAGS !Cond)
27019 // where Op could be BRCOND or CMOV.
27021 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
27022 // This combine only operates on CMP-like nodes.
27023 if (!(Cmp.getOpcode() == X86ISD::CMP ||
27024 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
27027 // Quit if not used as a boolean value.
27028 if (CC != X86::COND_E && CC != X86::COND_NE)
27031 // Check CMP operands. One of them should be 0 or 1 and the other should be
27032 // a SetCC or extended from it.
27033 SDValue Op1 = Cmp.getOperand(0);
27034 SDValue Op2 = Cmp.getOperand(1);
27037 const ConstantSDNode* C = nullptr;
27038 bool needOppositeCond = (CC == X86::COND_E);
27039 bool checkAgainstTrue = false; // Is it a comparison against 1?
27041 if ((C = dyn_cast<ConstantSDNode>(Op1)))
27043 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
27045 else // Quit if neither operand is a constant.
27048 if (C->getZExtValue() == 1) {
27049 needOppositeCond = !needOppositeCond;
27050 checkAgainstTrue = true;
27051 } else if (C->getZExtValue() != 0)
27052 // Quit if the constant is neither 0 nor 1.
27055 bool truncatedToBoolWithAnd = false;
27056 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
27057 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
27058 SetCC.getOpcode() == ISD::TRUNCATE ||
27059 SetCC.getOpcode() == ISD::AssertZext ||
27060 SetCC.getOpcode() == ISD::AND) {
27061 if (SetCC.getOpcode() == ISD::AND) {
27063 if (isOneConstant(SetCC.getOperand(0)))
27065 if (isOneConstant(SetCC.getOperand(1)))
27069 SetCC = SetCC.getOperand(OpIdx);
27070 truncatedToBoolWithAnd = true;
27072 SetCC = SetCC.getOperand(0);
27075 switch (SetCC.getOpcode()) {
27076 case X86ISD::SETCC_CARRY:
27077 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
27078 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
27079 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
27080 // truncated to i1 using 'and'.
27081 if (checkAgainstTrue && !truncatedToBoolWithAnd)
27083 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
27084 "Invalid use of SETCC_CARRY!");
27086 case X86ISD::SETCC:
27087 // Set the condition code or opposite one if necessary.
27088 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
27089 if (needOppositeCond)
27090 CC = X86::GetOppositeBranchCondition(CC);
27091 return SetCC.getOperand(1);
27092 case X86ISD::CMOV: {
27093 // Check whether the false/true values are canonical, i.e. 0 or 1.
27094 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
27095 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
27096 // Quit if true value is not a constant.
27099 // Quit if false value is not a constant.
27101 SDValue Op = SetCC.getOperand(0);
27102 // Skip 'zext' or 'trunc' node.
27103 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
27104 Op.getOpcode() == ISD::TRUNCATE)
27105 Op = Op.getOperand(0);
27106 // A special case for rdrand/rdseed, where 0 is set if the false condition
27107 // is found.
27108 if ((Op.getOpcode() != X86ISD::RDRAND &&
27109 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
27112 // Quit if false value is not the constant 0 or 1.
27113 bool FValIsFalse = true;
27114 if (FVal && FVal->getZExtValue() != 0) {
27115 if (FVal->getZExtValue() != 1)
27117 // If FVal is 1, opposite cond is needed.
27118 needOppositeCond = !needOppositeCond;
27119 FValIsFalse = false;
27121 // Quit if TVal is not the constant opposite of FVal.
27122 if (FValIsFalse && TVal->getZExtValue() != 1)
27124 if (!FValIsFalse && TVal->getZExtValue() != 0)
27126 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
27127 if (needOppositeCond)
27128 CC = X86::GetOppositeBranchCondition(CC);
27129 return SetCC.getOperand(3);
27136 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
27138 /// (X86or (X86setcc) (X86setcc))
27139 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
27140 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
27141 X86::CondCode &CC1, SDValue &Flags,
27143 if (Cond->getOpcode() == X86ISD::CMP) {
27144 if (!isNullConstant(Cond->getOperand(1)))
27147 Cond = Cond->getOperand(0);
27152 SDValue SetCC0, SetCC1;
27153 switch (Cond->getOpcode()) {
27154 default: return false;
27161 SetCC0 = Cond->getOperand(0);
27162 SetCC1 = Cond->getOperand(1);
27166 // Make sure we have SETCC nodes, using the same flags value.
27167 if (SetCC0.getOpcode() != X86ISD::SETCC ||
27168 SetCC1.getOpcode() != X86ISD::SETCC ||
27169 SetCC0->getOperand(1) != SetCC1->getOperand(1))
27172 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
27173 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
27174 Flags = SetCC0->getOperand(1);
27178 /// Optimize an EFLAGS definition used according to the condition code \p CC
27179 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
27180 /// uses of chain values.
27181 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
27182 SelectionDAG &DAG) {
27183 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
27185 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
27188 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
27189 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
27190 TargetLowering::DAGCombinerInfo &DCI,
27191 const X86Subtarget &Subtarget) {
27194 // If the flag operand isn't dead, don't touch this CMOV.
27195 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
27198 SDValue FalseOp = N->getOperand(0);
27199 SDValue TrueOp = N->getOperand(1);
27200 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
27201 SDValue Cond = N->getOperand(3);
27203 if (CC == X86::COND_E || CC == X86::COND_NE) {
27204 switch (Cond.getOpcode()) {
27208 // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
27209 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
27210 return (CC == X86::COND_E) ? FalseOp : TrueOp;
27214 // Try to simplify the EFLAGS and condition code operands.
27215 // We can't always do this as FCMOV only supports a subset of X86 cond.
27216 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
27217 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
27218 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
27220 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
27224 // If this is a select between two integer constants, try to do some
27225 // optimizations. Note that the operands are ordered the opposite of SELECT
27226 // operands.
27227 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
27228 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
27229 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
27230 // larger than FalseC (the false value).
27231 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
27232 CC = X86::GetOppositeBranchCondition(CC);
27233 std::swap(TrueC, FalseC);
27234 std::swap(TrueOp, FalseOp);
27237 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
27238 // This is efficient for any integer data type (including i8/i16) and
27240 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
27241 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27242 DAG.getConstant(CC, DL, MVT::i8), Cond);
27244 // Zero extend the condition if needed.
27245 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
27247 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
27248 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
27249 DAG.getConstant(ShAmt, DL, MVT::i8));
27250 if (N->getNumValues() == 2) // Dead flag value?
27251 return DCI.CombineTo(N, Cond, SDValue());
27255 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
27256 // for any integer data type, including i8/i16.
27257 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
27258 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27259 DAG.getConstant(CC, DL, MVT::i8), Cond);
27261 // Zero extend the condition if needed.
27262 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
27263 FalseC->getValueType(0), Cond);
27264 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
27265 SDValue(FalseC, 0));
27267 if (N->getNumValues() == 2) // Dead flag value?
27268 return DCI.CombineTo(N, Cond, SDValue());
27272 // Optimize cases that will turn into an LEA instruction. This requires
27273 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
27274 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
27275 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
27276 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
27278 bool isFastMultiplier = false;
27280 switch ((unsigned char)Diff) {
27282 case 1: // result = add base, cond
27283 case 2: // result = lea base( , cond*2)
27284 case 3: // result = lea base(cond, cond*2)
27285 case 4: // result = lea base( , cond*4)
27286 case 5: // result = lea base(cond, cond*4)
27287 case 8: // result = lea base( , cond*8)
27288 case 9: // result = lea base(cond, cond*8)
27289 isFastMultiplier = true;
27294 if (isFastMultiplier) {
27295 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
27296 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27297 DAG.getConstant(CC, DL, MVT::i8), Cond);
27298 // Zero extend the condition if needed.
27299 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
27301 // Scale the condition by the difference.
27303 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
27304 DAG.getConstant(Diff, DL, Cond.getValueType()));
27306 // Add the base if non-zero.
27307 if (FalseC->getAPIntValue() != 0)
27308 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
27309 SDValue(FalseC, 0));
27310 if (N->getNumValues() == 2) // Dead flag value?
27311 return DCI.CombineTo(N, Cond, SDValue());
27318 // Handle these cases:
27319 // (select (x != c), e, c) -> select (x != c), e, x),
27320 // (select (x == c), c, e) -> select (x == c), x, e)
27321 // where the c is an integer constant, and the "select" is the combination
27322 // of CMOV and CMP.
27324 // The rationale for this change is that the conditional move from a constant
27325 // needs two instructions, whereas a conditional move from a register needs
27326 // only one instruction.
27328 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
27329 // some instruction-combining opportunities. This opt needs to be
27330 // postponed as late as possible.
27332 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
27333 // the DCI.xxxx conditions are provided to postpone the optimization as
27334 // late as possible.
27336 ConstantSDNode *CmpAgainst = nullptr;
27337 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
27338 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
27339 !isa<ConstantSDNode>(Cond.getOperand(0))) {
27341 if (CC == X86::COND_NE &&
27342 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
27343 CC = X86::GetOppositeBranchCondition(CC);
27344 std::swap(TrueOp, FalseOp);
27347 if (CC == X86::COND_E &&
27348 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
27349 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
27350 DAG.getConstant(CC, DL, MVT::i8), Cond };
27351 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
27356 // Fold and/or of setcc's to double CMOV:
27357 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
27358 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
27360 // This combine lets us generate:
27361 // cmovcc1 (jcc1 if we don't have CMOV)
27367 // cmovne (jne if we don't have CMOV)
27368 // When we can't use the CMOV instruction, it might increase branch
27370 // When we can use CMOV, or when there is no mispredict, this improves
27371 // throughput and reduces register pressure.
27373 if (CC == X86::COND_NE) {
27375 X86::CondCode CC0, CC1;
27377 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
27379 std::swap(FalseOp, TrueOp);
27380 CC0 = X86::GetOppositeBranchCondition(CC0);
27381 CC1 = X86::GetOppositeBranchCondition(CC1);
27384 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
27386 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
27387 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
27388 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
27389 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
27397 /// Different mul shrinking modes.
27398 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
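// The modes correspond to the effective operand ranges, derived below from
// sign-bit counts of the 32-bit operands:
//   MULS8  : >= 25 sign bits                -> values fit in [-128, 127]
//   MULU8  : >= 24 sign bits, all positive  -> values fit in [0, 255]
//   MULS16 : >= 17 sign bits                -> values fit in [-32768, 32767]
//   MULU16 : >= 16 sign bits, all positive  -> values fit in [0, 65535]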
27400 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
27401 EVT VT = N->getOperand(0).getValueType();
27402 if (VT.getScalarSizeInBits() != 32)
27405 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
27406 unsigned SignBits[2] = {1, 1};
27407 bool IsPositive[2] = {false, false};
27408 for (unsigned i = 0; i < 2; i++) {
27409 SDValue Opd = N->getOperand(i);
27411 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
27412 // compute signbits for it separately.
27413 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
27414 // For anyextend, it is safe to assume an appropriate number of leading
27415 // sign/zero bits.
27416 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
27418 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
27423 IsPositive[i] = true;
27424 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
27425 // All the operands of BUILD_VECTOR need to be integer constants.
27426 // Find the smallest value range which all the operands belong to.
27428 IsPositive[i] = true;
27429 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
27430 if (SubOp.isUndef())
27432 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
27435 APInt IntVal = CN->getAPIntValue();
27436 if (IntVal.isNegative())
27437 IsPositive[i] = false;
27438 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
27441 SignBits[i] = DAG.ComputeNumSignBits(Opd);
27442 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
27443 IsPositive[i] = true;
27447 bool AllPositive = IsPositive[0] && IsPositive[1];
27448 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
27449 // When ranges are from -128 ~ 127, use MULS8 mode.
27450 if (MinSignBits >= 25)
27452 // When ranges are from 0 ~ 255, use MULU8 mode.
27453 else if (AllPositive && MinSignBits >= 24)
27455 // When ranges are from -32768 ~ 32767, use MULS16 mode.
27456 else if (MinSignBits >= 17)
27458 // When ranges are from 0 ~ 65535, use MULU16 mode.
27459 else if (AllPositive && MinSignBits >= 16)
27466 /// When the operands of a vector mul are extended from smaller-sized values,
27467 /// like i8 and i16, the type of the mul may be shrunk to generate more
27468 /// efficient code. Two typical patterns are handled:
27470 /// %2 = sext/zext <N x i8> %1 to <N x i32>
27471 /// %4 = sext/zext <N x i8> %3 to <N x i32>
27472 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
27473 /// %5 = mul <N x i32> %2, %4
27476 /// %2 = zext/sext <N x i16> %1 to <N x i32>
27477 /// %4 = zext/sext <N x i16> %3 to <N x i32>
27478 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
27479 /// %5 = mul <N x i32> %2, %4
27481 /// There are four mul shrinking modes:
27482 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
27483 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
27484 /// generate pmullw+sext32 for it (MULS8 mode).
27485 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
27486 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
27487 /// generate pmullw+zext32 for it (MULU8 mode).
27488 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
27489 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
27490 /// generate pmullw+pmulhw for it (MULS16 mode).
27491 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
27492 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
27493 /// generate pmullw+pmulhuw for it (MULU16 mode).
27494 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
27495 const X86Subtarget &Subtarget) {
27496 // pmulld is supported since SSE41. It is better to use pmulld
27497 // instead of pmullw+pmulhw.
27498 if (Subtarget.hasSSE41())
27502 if (!canReduceVMulWidth(N, DAG, Mode))
27506 SDValue N0 = N->getOperand(0);
27507 SDValue N1 = N->getOperand(1);
27508 EVT VT = N->getOperand(0).getValueType();
27509 unsigned RegSize = 128;
27510 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
27512 EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
27513 // Shrink the operands of mul.
27514 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
27515 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
27517 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
27518 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
27519 // lower part is needed.
27520 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
27521 if (Mode == MULU8 || Mode == MULS8) {
27522 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
27525 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
27526 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
27527 // the higher part is also needed.
27528 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
27529 ReducedVT, NewN0, NewN1);
27531 // Repack the lower part and higher part result of mul into a wider
27533 // Generate shuffle functioning as punpcklwd.
27534 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
27535 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
27536 ShuffleMask[2 * i] = i;
27537 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
27540 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
27541 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
27542 // Generate shuffle functioning as punpckhwd.
27543 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
27544 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
27545 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
27548 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
27549 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
27550 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
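// Scalar model of the repack above for one lane (little-endian): pmullw and
// pmulhw give the low and high 16 bits of each 32-bit product, and the unpack
// shuffles interleave them back into full i32 results, roughly
// (widen is just an illustrative name):
//   uint32_t widen(uint16_t lo, uint16_t hi) {
//     return (uint32_t)lo | ((uint32_t)hi << 16);
//   }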
27553 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
27554 // to legalize the mul explicitly because implicit legalization for type
27555 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
27556 // instructions which will not exist when we explicitly legalize it by
27557 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
27558 // <4 x i16> undef).
27560 // Legalize the operands of mul.
27561 SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
27562 DAG.getUNDEF(ReducedVT));
27564 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
27566 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
27568 if (Mode == MULU8 || Mode == MULS8) {
27569 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
27571 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
27573 // convert the type of mul result to VT.
27574 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
27575 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
27576 : ISD::SIGN_EXTEND_VECTOR_INREG,
27578 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
27579 DAG.getIntPtrConstant(0, DL));
27581 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
27582 // MULU16/MULS16, both parts are needed.
27583 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
27584 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
27585 OpsVT, NewN0, NewN1);
27587 // Repack the lower part and higher part result of mul into a wider
27588 // result. Make sure the type of mul result is VT.
27589 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
27590 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
27591 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
27592 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
27593 DAG.getIntPtrConstant(0, DL));
27598 /// Optimize a single multiply with constant into two operations in order to
27599 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
27600 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
27601 TargetLowering::DAGCombinerInfo &DCI,
27602 const X86Subtarget &Subtarget) {
27603 EVT VT = N->getValueType(0);
27604 if (DCI.isBeforeLegalize() && VT.isVector())
27605 return reduceVMULWidth(N, DAG, Subtarget);
27607 // An imul is usually smaller than the alternative sequence.
27608 if (DAG.getMachineFunction().getFunction()->optForMinSize())
27611 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
27614 if (VT != MVT::i64 && VT != MVT::i32)
27617 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
27620 uint64_t MulAmt = C->getZExtValue();
27621 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
27624 uint64_t MulAmt1 = 0;
27625 uint64_t MulAmt2 = 0;
27626 if ((MulAmt % 9) == 0) {
27628 MulAmt2 = MulAmt / 9;
27629 } else if ((MulAmt % 5) == 0) {
27631 MulAmt2 = MulAmt / 5;
27632 } else if ((MulAmt % 3) == 0) {
27634 MulAmt2 = MulAmt / 3;
27640 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
27642 if (isPowerOf2_64(MulAmt2) &&
27643 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
27644 // If the second multiplier is a power of 2, issue it first. We want the
27645 // multiply by 3, 5, or 9 to be folded into the addressing mode unless the
27646 // lone use is an add.
27647 std::swap(MulAmt1, MulAmt2);
27649 if (isPowerOf2_64(MulAmt1))
27650 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
27651 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
27653 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
27654 DAG.getConstant(MulAmt1, DL, VT));
27656 if (isPowerOf2_64(MulAmt2))
27657 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
27658 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
27660 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
27661 DAG.getConstant(MulAmt2, DL, VT));
27665 assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
27666 && "Both cases that could cause potential overflows should have "
27667 "already been handled.");
27668 if (isPowerOf2_64(MulAmt - 1))
27669 // (mul x, 2^N + 1) => (add (shl x, N), x)
27670 NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
27671 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
27672 DAG.getConstant(Log2_64(MulAmt - 1), DL,
27675 else if (isPowerOf2_64(MulAmt + 1))
27676 // (mul x, 2^N - 1) => (sub (shl x, N), x)
27677 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
27679 DAG.getConstant(Log2_64(MulAmt + 1),
27680 DL, MVT::i8)), N->getOperand(0));
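// A few concrete decompositions the code above produces:
//   x * 45 -> lea t, (x, x*8); lea r, (t, t*4)   // 45 == 9 * 5
//   x * 17 -> (x << 4) + x                       // 17 == 16 + 1
//   x * 30 -> kept as imul (no cheap two-operation split)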
27684 // Do not add new nodes to DAG combiner worklist.
27685 DCI.CombineTo(N, NewMul, false);
27690 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
27691 SDValue N0 = N->getOperand(0);
27692 SDValue N1 = N->getOperand(1);
27693 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
27694 EVT VT = N0.getValueType();
27696 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
27697 // since the result of setcc_c is all zeros or all ones.
27698 if (VT.isInteger() && !VT.isVector() &&
27699 N1C && N0.getOpcode() == ISD::AND &&
27700 N0.getOperand(1).getOpcode() == ISD::Constant) {
27701 SDValue N00 = N0.getOperand(0);
27702 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
27703 const APInt &ShAmt = N1C->getAPIntValue();
27704 Mask = Mask.shl(ShAmt);
27705 bool MaskOK = false;
27706 // We can handle cases concerning bit-widening nodes containing setcc_c if
27707 // we carefully interrogate the mask to make sure we are semantics
27708 // preserving.
27709 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
27710 // of the underlying setcc_c operation if the setcc_c was zero extended.
27711 // Consider the following example:
27712 // zext(setcc_c) -> i32 0x0000FFFF
27713 // c1 -> i32 0x0000FFFF
27714 // c2 -> i32 0x00000001
27715 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
27716 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
27717 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
27719 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
27720 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
27722 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
27723 N00.getOpcode() == ISD::ANY_EXTEND) &&
27724 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
27725 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
27727 if (MaskOK && Mask != 0) {
27729 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
27733 // Hardware support for vector shifts is sparse which makes us scalarize the
27734   // vector operations in many cases. Also, on Sandy Bridge ADD is faster than SHL.
27736 // (shl V, 1) -> add V,V
27737 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
27738 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
27739 assert(N0.getValueType().isVector() && "Invalid vector shift type");
27740 // We shift all of the values by one. In many cases we do not have
27741       // hardware support for this operation. This is better expressed as an ADD of two values.
27743 if (N1SplatC->getAPIntValue() == 1)
27744 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
27750 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
27751 SDValue N0 = N->getOperand(0);
27752 SDValue N1 = N->getOperand(1);
27753 EVT VT = N0.getValueType();
27754 unsigned Size = VT.getSizeInBits();
27756 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
27757 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
27758 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
27759 // depending on sign of (SarConst - [56,48,32,24,16])
27761 // sexts in X86 are MOVs. The MOVs have the same code size
27762   // as the above SHIFTs (only a SHIFT by 1 has smaller code size).
27763   // However, the MOVs have two advantages over a SHIFT:
27764   // 1. MOVs can write to a register that differs from the source.
27765   // 2. MOVs accept memory operands.
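  // Illustrative example: for VT == i32, (sra (shl x, 24), 27) matches the
  // ShiftSize == 8 iteration below and becomes
  // (sra (sign_extend_inreg x, i8), 3), since SarConst - (Size - ShiftSize)
  // == 27 - 24 == 3; had SarConst been 20, the negative result (-4) would
  // instead select (shl (sign_extend_inreg x, i8), 4).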
27767 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
27768 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
27769 N0.getOperand(1).getOpcode() != ISD::Constant)
27772 SDValue N00 = N0.getOperand(0);
27773 SDValue N01 = N0.getOperand(1);
27774 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
27775 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
27776 EVT CVT = N1.getValueType();
27778 if (SarConst.isNegative())
27781 for (MVT SVT : MVT::integer_valuetypes()) {
27782 unsigned ShiftSize = SVT.getSizeInBits();
27783 // skipping types without corresponding sext/zext and
27784 // ShlConst that is not one of [56,48,32,24,16]
27785 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
27789 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
27790 SarConst = SarConst - (Size - ShiftSize);
27793 else if (SarConst.isNegative())
27794 return DAG.getNode(ISD::SHL, DL, VT, NN,
27795 DAG.getConstant(-SarConst, DL, CVT));
27797 return DAG.getNode(ISD::SRA, DL, VT, NN,
27798 DAG.getConstant(SarConst, DL, CVT));
27803 /// \brief Returns a vector of 0s if the input node is a vector logical
27804 /// shift by a constant amount that is known to be greater than or equal
27805 /// to the vector element size in bits.
27806 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
27807 const X86Subtarget &Subtarget) {
27808 EVT VT = N->getValueType(0);
27810 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
27811 (!Subtarget.hasInt256() ||
27812 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
27815 SDValue Amt = N->getOperand(1);
27817 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
27818 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
27819 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
27820 unsigned MaxAmount =
27821 VT.getSimpleVT().getVectorElementType().getSizeInBits();
27823 // SSE2/AVX2 logical shifts always return a vector of 0s
27824 // if the shift amount is bigger than or equal to
27825 // the element size. The constant shift amount will be
27826       // encoded as an 8-bit immediate.
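      // For example, a splatted logical shift (srl v4i32 X, 32) is replaced with
      // a zero vector below, since shifting every 32-bit lane right by 32 yields 0.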
27827 if (ShiftAmt.trunc(8).uge(MaxAmount))
27828 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
27834 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
27835 TargetLowering::DAGCombinerInfo &DCI,
27836 const X86Subtarget &Subtarget) {
27837 if (N->getOpcode() == ISD::SHL)
27838 if (SDValue V = combineShiftLeft(N, DAG))
27841 if (N->getOpcode() == ISD::SRA)
27842 if (SDValue V = combineShiftRightAlgebraic(N, DAG))
27845 // Try to fold this logical shift into a zero vector.
27846 if (N->getOpcode() != ISD::SRA)
27847 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
27853 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
27854 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
27855 /// OR -> CMPNEQSS.
27856 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
27857 TargetLowering::DAGCombinerInfo &DCI,
27858 const X86Subtarget &Subtarget) {
27861 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
27862 // we're requiring SSE2 for both.
27863 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
27864 SDValue N0 = N->getOperand(0);
27865 SDValue N1 = N->getOperand(1);
27866 SDValue CMP0 = N0->getOperand(1);
27867 SDValue CMP1 = N1->getOperand(1);
27870 // The SETCCs should both refer to the same CMP.
27871 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
27874 SDValue CMP00 = CMP0->getOperand(0);
27875 SDValue CMP01 = CMP0->getOperand(1);
27876 EVT VT = CMP00.getValueType();
27878 if (VT == MVT::f32 || VT == MVT::f64) {
27879 bool ExpectingFlags = false;
27880 // Check for any users that want flags:
27881 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
27882 !ExpectingFlags && UI != UE; ++UI)
27883 switch (UI->getOpcode()) {
27888 ExpectingFlags = true;
27890 case ISD::CopyToReg:
27891 case ISD::SIGN_EXTEND:
27892 case ISD::ZERO_EXTEND:
27893 case ISD::ANY_EXTEND:
27897 if (!ExpectingFlags) {
27898 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
27899 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
27901 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
27902 X86::CondCode tmp = cc0;
27907 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
27908 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
27909 // FIXME: need symbolic constants for these magic numbers.
27910 // See X86ATTInstPrinter.cpp:printSSECC().
27911 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
27912 if (Subtarget.hasAVX512()) {
27913 SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
27915 DAG.getConstant(x86cc, DL, MVT::i8));
27916 if (N->getValueType(0) != MVT::i1)
27917 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
27921 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
27922 CMP00.getValueType(), CMP00, CMP01,
27923 DAG.getConstant(x86cc, DL,
27926 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
27927 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
27929 if (is64BitFP && !Subtarget.is64Bit()) {
27930 // On a 32-bit target, we cannot bitcast the 64-bit float to a
27931 // 64-bit integer, since that's not a legal type. Since
27932             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
27933 // bits, but can do this little dance to extract the lowest 32 bits
27934 // and work with those going forward.
27935 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
27937 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
27938 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
27939 Vector32, DAG.getIntPtrConstant(0, DL));
27943 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
27944 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
27945 DAG.getConstant(1, DL, IntVT));
27946 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27948 return OneBitOfTruth;
27956 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
27957 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
27958 assert(N->getOpcode() == ISD::AND);
27960 EVT VT = N->getValueType(0);
27961 SDValue N0 = N->getOperand(0);
27962 SDValue N1 = N->getOperand(1);
27965 if (VT != MVT::v2i64 && VT != MVT::v4i64 &&
27966 VT != MVT::v8i64 && VT != MVT::v16i32 &&
27967 VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX
27970 // Canonicalize XOR to the left.
27971 if (N1.getOpcode() == ISD::XOR)
27974 if (N0.getOpcode() != ISD::XOR)
27977 SDValue N00 = N0->getOperand(0);
27978 SDValue N01 = N0->getOperand(1);
27980 N01 = peekThroughBitcasts(N01);
27982 // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
27983 // insert_subvector building a 256-bit AllOnes vector.
27984 if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
27985 if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
27988 SDValue V1 = N01->getOperand(0);
27989 SDValue V2 = N01->getOperand(1);
27990 if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
27991 !V1.getOperand(0).isUndef() ||
27992 !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
27993 !ISD::isBuildVectorAllOnes(V2.getNode()))
27996 return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
27999 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
28000 // register. In most cases we actually compare or select YMM-sized registers
28001 // and mixing the two types creates horrible code. This method optimizes
28002 // some of the transition sequences.
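// Illustrative example (assuming the wide operation is legal or promotable):
// (sign_extend (and (trunc A:v8i32), (trunc B:v8i32))), with the AND done on
// v8i16, is rewritten as (sign_extend_inreg (and A, B), v8i16) computed
// directly on v8i32, avoiding the narrow XMM round trip.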
28003 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
28004 TargetLowering::DAGCombinerInfo &DCI,
28005 const X86Subtarget &Subtarget) {
28006 EVT VT = N->getValueType(0);
28007 if (!VT.is256BitVector())
28010 assert((N->getOpcode() == ISD::ANY_EXTEND ||
28011 N->getOpcode() == ISD::ZERO_EXTEND ||
28012 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
28014 SDValue Narrow = N->getOperand(0);
28015 EVT NarrowVT = Narrow->getValueType(0);
28016 if (!NarrowVT.is128BitVector())
28019 if (Narrow->getOpcode() != ISD::XOR &&
28020 Narrow->getOpcode() != ISD::AND &&
28021 Narrow->getOpcode() != ISD::OR)
28024 SDValue N0 = Narrow->getOperand(0);
28025 SDValue N1 = Narrow->getOperand(1);
28028 // The Left side has to be a trunc.
28029 if (N0.getOpcode() != ISD::TRUNCATE)
28032 // The type of the truncated inputs.
28033 EVT WideVT = N0->getOperand(0)->getValueType(0);
28037 // The right side has to be a 'trunc' or a constant vector.
28038 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
28039 ConstantSDNode *RHSConstSplat = nullptr;
28040 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
28041 RHSConstSplat = RHSBV->getConstantSplatNode();
28042 if (!RHSTrunc && !RHSConstSplat)
28045 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28047 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
28050 // Set N0 and N1 to hold the inputs to the new wide operation.
28051 N0 = N0->getOperand(0);
28052 if (RHSConstSplat) {
28053 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
28054 SDValue(RHSConstSplat, 0));
28055 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
28056 } else if (RHSTrunc) {
28057 N1 = N1->getOperand(0);
28060 // Generate the wide operation.
28061 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
28062 unsigned Opcode = N->getOpcode();
28064 case ISD::ANY_EXTEND:
28066 case ISD::ZERO_EXTEND: {
28067 unsigned InBits = NarrowVT.getScalarSizeInBits();
28068 APInt Mask = APInt::getAllOnesValue(InBits);
28069 Mask = Mask.zext(VT.getScalarSizeInBits());
28070 return DAG.getNode(ISD::AND, DL, VT,
28071 Op, DAG.getConstant(Mask, DL, VT));
28073 case ISD::SIGN_EXTEND:
28074 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
28075 Op, DAG.getValueType(NarrowVT));
28077 llvm_unreachable("Unexpected opcode");
28081 static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG,
28082 TargetLowering::DAGCombinerInfo &DCI,
28083 const X86Subtarget &Subtarget) {
28084 SDValue N0 = N->getOperand(0);
28085 SDValue N1 = N->getOperand(1);
28088 // A vector zext_in_reg may be represented as a shuffle,
28089 // feeding into a bitcast (this represents anyext) feeding into
28090 // an and with a mask.
28091 // We'd like to try to combine that into a shuffle with zero
28092 // plus a bitcast, removing the and.
28093 if (N0.getOpcode() != ISD::BITCAST ||
28094 N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
28097   // The other side of the AND should be a splat of 2^C - 1, where C
28098   // is the number of bits in the source element type.
28099 N1 = peekThroughBitcasts(N1);
28100 if (N1.getOpcode() != ISD::BUILD_VECTOR)
28102 BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
28104 ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
28105 EVT SrcType = Shuffle->getValueType(0);
28107 // We expect a single-source shuffle
28108 if (!Shuffle->getOperand(1)->isUndef())
28111 unsigned SrcSize = SrcType.getScalarSizeInBits();
28112 unsigned NumElems = SrcType.getVectorNumElements();
28114 APInt SplatValue, SplatUndef;
28115 unsigned SplatBitSize;
28117 if (!Vector->isConstantSplat(SplatValue, SplatUndef,
28118 SplatBitSize, HasAnyUndefs))
28121 unsigned ResSize = N1.getValueType().getScalarSizeInBits();
28122 // Make sure the splat matches the mask we expect
28123 if (SplatBitSize > ResSize ||
28124 (SplatValue + 1).exactLogBase2() != (int)SrcSize)
28127   // Make sure the input and output sizes make sense
28128 if (SrcSize >= ResSize || ResSize % SrcSize)
28131 // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
28132   // The number of u's between consecutive values depends on the ratio between
28133   // the source and destination types.
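  // Illustrative example: zero-extending v4i8 to v4i32 gives ZextRatio == 4,
  // so over a v16i8 shuffle we expect <0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u>, which
  // is rebuilt below as <0,16,16,16,1,16,16,16,...> with a zero second operand.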
28134 unsigned ZextRatio = ResSize / SrcSize;
28135 bool IsZext = true;
28136 for (unsigned i = 0; i != NumElems; ++i) {
28137 if (i % ZextRatio) {
28138 if (Shuffle->getMaskElt(i) > 0) {
28144 if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
28145 // Expected element number
28155 // Ok, perform the transformation - replace the shuffle with
28156 // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
28157 // (instead of undef) where the k elements come from the zero vector.
28158 SmallVector<int, 8> Mask;
28159 for (unsigned i = 0; i != NumElems; ++i)
28161 Mask.push_back(NumElems);
28163 Mask.push_back(i / ZextRatio);
28165 SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
28166 Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
28167 return DAG.getBitcast(N0.getValueType(), NewShuffle);
28170 /// If both input operands of a logic op are being cast from floating point
28171 /// types, try to convert this into a floating point logic node to avoid
28172 /// unnecessary moves from SSE to integer registers.
28173 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
28174 const X86Subtarget &Subtarget) {
28175 unsigned FPOpcode = ISD::DELETED_NODE;
28176 if (N->getOpcode() == ISD::AND)
28177 FPOpcode = X86ISD::FAND;
28178 else if (N->getOpcode() == ISD::OR)
28179 FPOpcode = X86ISD::FOR;
28180 else if (N->getOpcode() == ISD::XOR)
28181 FPOpcode = X86ISD::FXOR;
28183 assert(FPOpcode != ISD::DELETED_NODE &&
28184 "Unexpected input node for FP logic conversion");
28186 EVT VT = N->getValueType(0);
28187 SDValue N0 = N->getOperand(0);
28188 SDValue N1 = N->getOperand(1);
28190 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
28191 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
28192 (Subtarget.hasSSE2() && VT == MVT::i64))) {
28193 SDValue N00 = N0.getOperand(0);
28194 SDValue N10 = N1.getOperand(0);
28195 EVT N00Type = N00.getValueType();
28196 EVT N10Type = N10.getValueType();
28197 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
28198 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
28199 return DAG.getBitcast(VT, FPLogic);
28205 /// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
28206 /// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
28207 /// eliminate loading the vector constant mask value. This relies on the fact
28208 /// that a PCMP always creates an all-ones or all-zeros bitmask per element.
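/// For example, (and (pcmpgtd X, Y), splat(1)) becomes
/// (psrld (pcmpgtd X, Y), 31): each compare lane is all-ones or zero, so a
/// logical right shift by 31 leaves the same 0/1 value the AND produced.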
28209 static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
28210 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
28211 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
28213 // TODO: Use AssertSext to mark any nodes that have the property of producing
28214   // all-ones or all-zeros. Then check for that node rather than particular opcodes.
28216 if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
28219 // The existence of the PCMP node guarantees that we have the required SSE2 or
28220 // AVX2 for a shift of this vector type, but there is no vector shift by
28221 // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
28222 // masked compare nodes, so they should not make it here.
28223 EVT VT0 = Op0.getValueType();
28224 EVT VT1 = Op1.getValueType();
28225 unsigned EltBitWidth = VT0.getScalarType().getSizeInBits();
28226 if (VT0 != VT1 || EltBitWidth == 8)
28229 assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
28232 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
28236 SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
28237 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
28238 return DAG.getBitcast(N->getValueType(0), Shift);
28241 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
28242 TargetLowering::DAGCombinerInfo &DCI,
28243 const X86Subtarget &Subtarget) {
28244 if (DCI.isBeforeLegalizeOps())
28247 if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget))
28250 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
28253 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28256 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
28259 if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
28262 EVT VT = N->getValueType(0);
28263 SDValue N0 = N->getOperand(0);
28264 SDValue N1 = N->getOperand(1);
28267 // Create BEXTR instructions
28268 // BEXTR is ((X >> imm) & (2**size-1))
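  // For example, on i32 (and (srl x, 8), 0xFF) becomes
  // (X86ISD::BEXTR x, 0x0808): the start bit (8) goes in the low control byte
  // and the field length (popcount of the mask, 8) in the next byte.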
28269 if (VT != MVT::i32 && VT != MVT::i64)
28272 if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
28274 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
28277 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
28278 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
28279 if (MaskNode && ShiftNode) {
28280 uint64_t Mask = MaskNode->getZExtValue();
28281 uint64_t Shift = ShiftNode->getZExtValue();
28282 if (isMask_64(Mask)) {
28283 uint64_t MaskSize = countPopulation(Mask);
28284 if (Shift + MaskSize <= VT.getSizeInBits())
28285 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
28286 DAG.getConstant(Shift | (MaskSize << 8), DL,
28294 // Try to fold: (or (and (m, y), (pandn m, x)))
28296 // into: (vselect m, x, y)
28297 // As a special case, try to fold:
28298 //   (or (and (m, (sub 0, x)), (pandn m, x)))
28300 // into: (sub (xor X, M), M)
28301 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
28302 const X86Subtarget &Subtarget) {
28303 assert(N->getOpcode() == ISD::OR);
28305 SDValue N0 = N->getOperand(0);
28306 SDValue N1 = N->getOperand(1);
28307 EVT VT = N->getValueType(0);
28309 if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
28311 assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
28313 // Canonicalize pandn to RHS
28314 if (N0.getOpcode() == X86ISD::ANDNP)
28317 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
28320 SDValue Mask = N1.getOperand(0);
28321 SDValue X = N1.getOperand(1);
28323 if (N0.getOperand(0) == Mask)
28324 Y = N0.getOperand(1);
28325 if (N0.getOperand(1) == Mask)
28326 Y = N0.getOperand(0);
28328 // Check to see if the mask appeared in both the AND and ANDNP.
28332 // Validate that X, Y, and Mask are bitcasts, and see through them.
28333 Mask = peekThroughBitcasts(Mask);
28334 X = peekThroughBitcasts(X);
28335 Y = peekThroughBitcasts(Y);
28337 EVT MaskVT = Mask.getValueType();
28339 // Validate that the Mask operand is a vector sra node.
28340 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
28341 // there is no psrai.b
28342 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
28343 unsigned SraAmt = ~0;
28344 if (Mask.getOpcode() == ISD::SRA) {
28345 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
28346 if (auto *AmtConst = AmtBV->getConstantSplatNode())
28347 SraAmt = AmtConst->getZExtValue();
28348 } else if (Mask.getOpcode() == X86ISD::VSRAI) {
28349 SDValue SraC = Mask.getOperand(1);
28350 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
28352 if ((SraAmt + 1) != EltBits)
28358   // Try to match: (or (and (M, (sub 0, X)), (pandn M, X)))
28359 // which is a special case of vselect:
28360 // (vselect M, (sub 0, X), X)
28362 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
28363 // We know that, if fNegate is 0 or 1:
28364 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
28366 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
28367 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
28368 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
28369 // This lets us transform our vselect to:
28370 // (add (xor X, M), (and M, 1))
28372 // (sub (xor X, M), M)
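  // Sanity check of the identity: when M is all ones, (xor X, M) == ~X and
  // (sub ~X, -1) == ~X + 1 == -X; when M is zero, (sub (xor X, 0), 0) == X.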
28373 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
28374 auto IsNegV = [](SDNode *N, SDValue V) {
28375 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
28376 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
28379 if (IsNegV(Y.getNode(), X))
28381 else if (IsNegV(X.getNode(), Y))
28385 assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
28386 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
28387 SDValue SubOp2 = Mask;
28389 // If the negate was on the false side of the select, then
28390 // the operands of the SUB need to be swapped. PR 27251.
28391 // This is because the pattern being matched above is
28392     // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
28393 // but if the pattern matched was
28394 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
28395 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
28396 // pattern also needs to be a negation of the replacement pattern above.
28397 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
28398 // sub accomplishes the negation of the replacement pattern.
28400 std::swap(SubOp1, SubOp2);
28402 return DAG.getBitcast(VT,
28403 DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
28407 // PBLENDVB is only available on SSE 4.1.
28408 if (!Subtarget.hasSSE41())
28411 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
28413 X = DAG.getBitcast(BlendVT, X);
28414 Y = DAG.getBitcast(BlendVT, Y);
28415 Mask = DAG.getBitcast(BlendVT, Mask);
28416 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
28417 return DAG.getBitcast(VT, Mask);
28420 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
28421 TargetLowering::DAGCombinerInfo &DCI,
28422 const X86Subtarget &Subtarget) {
28423 if (DCI.isBeforeLegalizeOps())
28426 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
28429 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28432 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
28435 SDValue N0 = N->getOperand(0);
28436 SDValue N1 = N->getOperand(1);
28437 EVT VT = N->getValueType(0);
28439 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
28442 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
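  // For example, with i32 operands, (or (shl x, 8), (srl y, 24)) is matched
  // below as (X86ISD::SHLD x, y, 8) because the two shift amounts sum to 32.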
28443 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
28445 // SHLD/SHRD instructions have lower register pressure, but on some
28446 // platforms they have higher latency than the equivalent
28447 // series of shifts/or that would otherwise be generated.
28448 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
28449 // have higher latencies and we are not optimizing for size.
28450 if (!OptForSize && Subtarget.isSHLDSlow())
28453 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
28455 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
28457 if (!N0.hasOneUse() || !N1.hasOneUse())
28460 SDValue ShAmt0 = N0.getOperand(1);
28461 if (ShAmt0.getValueType() != MVT::i8)
28463 SDValue ShAmt1 = N1.getOperand(1);
28464 if (ShAmt1.getValueType() != MVT::i8)
28466 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
28467 ShAmt0 = ShAmt0.getOperand(0);
28468 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
28469 ShAmt1 = ShAmt1.getOperand(0);
28472 unsigned Opc = X86ISD::SHLD;
28473 SDValue Op0 = N0.getOperand(0);
28474 SDValue Op1 = N1.getOperand(0);
28475 if (ShAmt0.getOpcode() == ISD::SUB) {
28476 Opc = X86ISD::SHRD;
28477 std::swap(Op0, Op1);
28478 std::swap(ShAmt0, ShAmt1);
28481 unsigned Bits = VT.getSizeInBits();
28482 if (ShAmt1.getOpcode() == ISD::SUB) {
28483 SDValue Sum = ShAmt1.getOperand(0);
28484 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
28485 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
28486 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
28487 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
28488 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
28489 return DAG.getNode(Opc, DL, VT,
28491 DAG.getNode(ISD::TRUNCATE, DL,
28494 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
28495 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
28497 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
28498 return DAG.getNode(Opc, DL, VT,
28499 N0.getOperand(0), N1.getOperand(0),
28500 DAG.getNode(ISD::TRUNCATE, DL,
28507 // Generate NEG and CMOV for integer abs.
28508 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
28509 EVT VT = N->getValueType(0);
28511 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28512 // 8-bit integer abs to NEG and CMOV.
28513 if (VT.isInteger() && VT.getSizeInBits() == 8)
28516 SDValue N0 = N->getOperand(0);
28517 SDValue N1 = N->getOperand(1);
28520 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
28521 // and change it to SUB and CMOV.
28522 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
28523 N0.getOpcode() == ISD::ADD &&
28524 N0.getOperand(1) == N1 &&
28525 N1.getOpcode() == ISD::SRA &&
28526 N1.getOperand(0) == N0.getOperand(0))
28527 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
28528 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
28529 // Generate SUB & CMOV.
28530 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28531 DAG.getConstant(0, DL, VT), N0.getOperand(0));
28533 SDValue Ops[] = { N0.getOperand(0), Neg,
28534 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
28535 SDValue(Neg.getNode(), 1) };
28536 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
28541 /// Try to turn tests against the signbit in the form of:
28542 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1) into SETGT(X, -1).
28545 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
28546 // This is only worth doing if the output type is i8 or i1.
28547 EVT ResultType = N->getValueType(0);
28548 if (ResultType != MVT::i8 && ResultType != MVT::i1)
28551 SDValue N0 = N->getOperand(0);
28552 SDValue N1 = N->getOperand(1);
28554 // We should be performing an xor against a truncated shift.
28555 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
28558 // Make sure we are performing an xor against one.
28559 if (!isOneConstant(N1))
28562 // SetCC on x86 zero extends so only act on this if it's a logical shift.
28563 SDValue Shift = N0.getOperand(0);
28564 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
28567 // Make sure we are truncating from one of i16, i32 or i64.
28568 EVT ShiftTy = Shift.getValueType();
28569 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
28572 // Make sure the shift amount extracts the sign bit.
28573 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
28574 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
28577 // Create a greater-than comparison against -1.
28578   // N.B. Using SETGE against 0 works, but we want a canonical-looking
28579   // comparison; using SETGT matches up with what TranslateX86CC produces.
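  // For example, on i32 this turns (xor (trunc (srl X, 31)), 1) into
  // (setcc X, -1, setgt): xor'ing the extracted sign bit with 1 yields 1
  // exactly when X is non-negative, which is what X > -1 computes.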
28581 SDValue ShiftOp = Shift.getOperand(0);
28582 EVT ShiftOpTy = ShiftOp.getValueType();
28583 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28584 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
28585 *DAG.getContext(), ResultType);
28586 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
28587 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
28588 if (SetCCResultType != ResultType)
28589 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
28593 /// Turn vector tests of the signbit in the form of:
28594 ///   xor (sra X, elt_size(X)-1), -1 into pcmpgt X, -1.
28598 /// This should be called before type legalization because the pattern may not
28599 /// persist after that.
28600 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
28601 const X86Subtarget &Subtarget) {
28602 EVT VT = N->getValueType(0);
28603 if (!VT.isSimple())
28606 switch (VT.getSimpleVT().SimpleTy) {
28607 default: return SDValue();
28610 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
28611 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
28615 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
28618 // There must be a shift right algebraic before the xor, and the xor must be a
28619 // 'not' operation.
28620 SDValue Shift = N->getOperand(0);
28621 SDValue Ones = N->getOperand(1);
28622 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
28623 !ISD::isBuildVectorAllOnes(Ones.getNode()))
28626 // The shift should be smearing the sign bit across each vector element.
28627 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
28631 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
28632 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
28633 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
28636 // Create a greater-than comparison against -1. We don't use the more obvious
28637 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
28638 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
28641 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
28642 TargetLowering::DAGCombinerInfo &DCI,
28643 const X86Subtarget &Subtarget) {
28644 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
28647 if (DCI.isBeforeLegalizeOps())
28650 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
28653 if (Subtarget.hasCMov())
28654 if (SDValue RV = combineIntegerAbs(N, DAG))
28657 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28663 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
28664 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
28665 /// X86ISD::AVG instruction.
28666 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
28667 const X86Subtarget &Subtarget,
28669 if (!VT.isVector() || !VT.isSimple())
28671 EVT InVT = In.getValueType();
28672 unsigned NumElems = VT.getVectorNumElements();
28674 EVT ScalarVT = VT.getVectorElementType();
28675 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
28676 isPowerOf2_32(NumElems)))
28679 // InScalarVT is the intermediate type in AVG pattern and it should be greater
28680 // than the original input type (i8/i16).
28681 EVT InScalarVT = InVT.getVectorElementType();
28682 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
28685 if (!Subtarget.hasSSE2())
28687 if (Subtarget.hasAVX512()) {
28688 if (VT.getSizeInBits() > 512)
28690 } else if (Subtarget.hasAVX2()) {
28691 if (VT.getSizeInBits() > 256)
28694 if (VT.getSizeInBits() > 128)
28698 // Detect the following pattern:
28700 // %1 = zext <N x i8> %a to <N x i32>
28701 // %2 = zext <N x i8> %b to <N x i32>
28702 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
28703 // %4 = add nuw nsw <N x i32> %3, %2
28705   //   %5 = lshr <N x i32> %4, <i32 1 x N>
28705 // %6 = trunc <N x i32> %5 to <N x i8>
28707 // In AVX512, the last instruction can also be a trunc store.
28709 if (In.getOpcode() != ISD::SRL)
28712 // A lambda checking the given SDValue is a constant vector and each element
28713 // is in the range [Min, Max].
28714 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
28715 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
28716 if (!BV || !BV->isConstant())
28718 for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
28719 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
28722 uint64_t Val = C->getZExtValue();
28723 if (Val < Min || Val > Max)
28729   // Check if each element of the vector is right-shifted by one.
28730 auto LHS = In.getOperand(0);
28731 auto RHS = In.getOperand(1);
28732 if (!IsConstVectorInRange(RHS, 1, 1))
28734 if (LHS.getOpcode() != ISD::ADD)
28737 // Detect a pattern of a + b + 1 where the order doesn't matter.
28738 SDValue Operands[3];
28739 Operands[0] = LHS.getOperand(0);
28740 Operands[1] = LHS.getOperand(1);
28742 // Take care of the case when one of the operands is a constant vector whose
28743   // element is in the range [1, 256] (for i8; [1, 65536] for i16).
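  // For example, (trunc (lshr ((zext a) + 7), 1)) is handled here as
  // avg(a, 6), since a + 7 == a + 6 + 1 and AVG rounds the halving up.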
28744 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
28745 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
28746 Operands[0].getOperand(0).getValueType() == VT) {
28747 // The pattern is detected. Subtract one from the constant vector, then
28748 // demote it and emit X86ISD::AVG instruction.
28749 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
28750 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
28751 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
28752 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
28756 if (Operands[0].getOpcode() == ISD::ADD)
28757 std::swap(Operands[0], Operands[1]);
28758 else if (Operands[1].getOpcode() != ISD::ADD)
28760 Operands[2] = Operands[1].getOperand(0);
28761 Operands[1] = Operands[1].getOperand(1);
28763 // Now we have three operands of two additions. Check that one of them is a
28764 // constant vector with ones, and the other two are promoted from i8/i16.
28765 for (int i = 0; i < 3; ++i) {
28766 if (!IsConstVectorInRange(Operands[i], 1, 1))
28768 std::swap(Operands[i], Operands[2]);
28770 // Check if Operands[0] and Operands[1] are results of type promotion.
28771 for (int j = 0; j < 2; ++j)
28772 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
28773 Operands[j].getOperand(0).getValueType() != VT)
28776 // The pattern is detected, emit X86ISD::AVG instruction.
28777 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
28778 Operands[1].getOperand(0));
28784 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
28785 TargetLowering::DAGCombinerInfo &DCI,
28786 const X86Subtarget &Subtarget) {
28787 LoadSDNode *Ld = cast<LoadSDNode>(N);
28788 EVT RegVT = Ld->getValueType(0);
28789 EVT MemVT = Ld->getMemoryVT();
28791 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28793 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
28794 // into two 16-byte operations.
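  // For example, a slow-unaligned v8i32 load is rewritten below as two
  // half-width loads, one at Ptr and one at Ptr + 16, whose results are
  // inserted back into an undef 256-bit vector and chained via a TokenFactor.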
28795 ISD::LoadExtType Ext = Ld->getExtensionType();
28797 unsigned AddressSpace = Ld->getAddressSpace();
28798 unsigned Alignment = Ld->getAlignment();
28799 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
28800 Ext == ISD::NON_EXTLOAD &&
28801 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
28802 AddressSpace, Alignment, &Fast) && !Fast) {
28803 unsigned NumElems = RegVT.getVectorNumElements();
28807 SDValue Ptr = Ld->getBasePtr();
28809 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
28812 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
28813 Alignment, Ld->getMemOperand()->getFlags());
28815 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
28817 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
28818 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
28819 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
28821 Load2.getValue(1));
28823 SDValue NewVec = DAG.getUNDEF(RegVT);
28824 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
28825 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
28826 return DCI.CombineTo(N, NewVec, TF, true);
28832 /// If V is a build vector of boolean constants and exactly one of those
28833 /// constants is true, return the operand index of that true element.
28834 /// Otherwise, return -1.
28835 static int getOneTrueElt(SDValue V) {
28836 // This needs to be a build vector of booleans.
28837 // TODO: Checking for the i1 type matches the IR definition for the mask,
28838 // but the mask check could be loosened to i8 or other types. That might
28839 // also require checking more than 'allOnesValue'; eg, the x86 HW
28840 // instructions only require that the MSB is set for each mask element.
28841   // The ISD::MSTORE comments/definition do not specify how the mask operand is interpreted.
28843 auto *BV = dyn_cast<BuildVectorSDNode>(V);
28844 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
28847 int TrueIndex = -1;
28848 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
28849 for (unsigned i = 0; i < NumElts; ++i) {
28850 const SDValue &Op = BV->getOperand(i);
28853 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
28856 if (ConstNode->getAPIntValue().isAllOnesValue()) {
28857 // If we already found a one, this is too many.
28858 if (TrueIndex >= 0)
28866 /// Given a masked memory load/store operation, return true if it has one mask
28867 /// bit set. If it has one mask bit set, then also return the memory address of
28868 /// the scalar element to load/store, the vector index to insert/extract that
28869 /// scalar element, and the alignment for the scalar memory access.
28870 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
28871 SelectionDAG &DAG, SDValue &Addr,
28872 SDValue &Index, unsigned &Alignment) {
28873 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
28874 if (TrueMaskElt < 0)
28877 // Get the address of the one scalar element that is specified by the mask
28878 // using the appropriate offset from the base pointer.
28879 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
28880 Addr = MaskedOp->getBasePtr();
28881 if (TrueMaskElt != 0) {
28882 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
28883 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
28886 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
28887 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
28891 /// If exactly one element of the mask is set for a non-extending masked load,
28892 /// it is a scalar load and vector insert.
28893 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
28894 /// mask have already been optimized in IR, so we don't bother with those here.
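/// For example, a v4f32 masked load whose mask is <0,0,1,0> becomes a scalar
/// f32 load from BasePtr + 8 plus an insert_vector_elt of that value into the
/// pass-through vector at index 2.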
28896 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
28897 TargetLowering::DAGCombinerInfo &DCI) {
28898 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
28899 // However, some target hooks may need to be added to know when the transform
28900 // is profitable. Endianness would also have to be considered.
28902 SDValue Addr, VecIndex;
28903 unsigned Alignment;
28904 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
28907 // Load the one scalar element that is specified by the mask using the
28908 // appropriate offset from the base pointer.
28910 EVT VT = ML->getValueType(0);
28911 EVT EltVT = VT.getVectorElementType();
28913 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
28914 Alignment, ML->getMemOperand()->getFlags());
28916 // Insert the loaded element into the appropriate place in the vector.
28917 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
28919 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
28923 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
28924 TargetLowering::DAGCombinerInfo &DCI) {
28925 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
28929 EVT VT = ML->getValueType(0);
28931 // If we are loading the first and last elements of a vector, it is safe and
28932 // always faster to load the whole vector. Replace the masked load with a
28933 // vector load and select.
28934 unsigned NumElts = VT.getVectorNumElements();
28935 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
28936 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
28937 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
28938 if (LoadFirstElt && LoadLastElt) {
28939 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
28940 ML->getMemOperand());
28941 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
28942 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
28945 // Convert a masked load with a constant mask into a masked load and a select.
28946 // This allows the select operation to use a faster kind of select instruction
28947 // (for example, vblendvps -> vblendps).
28949 // Don't try this if the pass-through operand is already undefined. That would
28950 // cause an infinite loop because that's what we're about to create.
28951 if (ML->getSrc0().isUndef())
28954 // The new masked load has an undef pass-through operand. The select uses the
28955 // original pass-through operand.
28956 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
28957 ML->getMask(), DAG.getUNDEF(VT),
28958 ML->getMemoryVT(), ML->getMemOperand(),
28959 ML->getExtensionType());
28960 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
28962 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
28965 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
28966 TargetLowering::DAGCombinerInfo &DCI,
28967 const X86Subtarget &Subtarget) {
28968 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
28969 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
28970 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
28972 // TODO: Do some AVX512 subsets benefit from this transform?
28973 if (!Subtarget.hasAVX512())
28974 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
28978 if (Mld->getExtensionType() != ISD::SEXTLOAD)
28981 // Resolve extending loads.
28982 EVT VT = Mld->getValueType(0);
28983 unsigned NumElems = VT.getVectorNumElements();
28984 EVT LdVT = Mld->getMemoryVT();
28987 assert(LdVT != VT && "Cannot extend to the same type");
28988 unsigned ToSz = VT.getVectorElementType().getSizeInBits();
28989 unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
28990 // From/To sizes and ElemCount must be pow of two.
28991 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
28992 "Unexpected size for extending masked load");
28994 unsigned SizeRatio = ToSz / FromSz;
28995 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
28997 // Create a type on which we perform the shuffle.
28998 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
28999 LdVT.getScalarType(), NumElems*SizeRatio);
29000 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
29002 // Convert Src0 value.
29003 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
29004 if (!Mld->getSrc0().isUndef()) {
29005 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29006 for (unsigned i = 0; i != NumElems; ++i)
29007 ShuffleVec[i] = i * SizeRatio;
29009 // Can't shuffle using an illegal type.
29010 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
29011 "WideVecVT should be legal");
29012 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
29013 DAG.getUNDEF(WideVecVT), ShuffleVec);
29015 // Prepare the new mask.
29017 SDValue Mask = Mld->getMask();
29018 if (Mask.getValueType() == VT) {
29019 // Mask and original value have the same type.
29020 NewMask = DAG.getBitcast(WideVecVT, Mask);
29021 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29022 for (unsigned i = 0; i != NumElems; ++i)
29023 ShuffleVec[i] = i * SizeRatio;
29024 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
29025 ShuffleVec[i] = NumElems * SizeRatio;
29026 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
29027 DAG.getConstant(0, dl, WideVecVT),
29030 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
29031 unsigned WidenNumElts = NumElems*SizeRatio;
29032 unsigned MaskNumElts = VT.getVectorNumElements();
29033 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
29036 unsigned NumConcat = WidenNumElts / MaskNumElts;
29037 SmallVector<SDValue, 16> Ops(NumConcat);
29038 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
29040 for (unsigned i = 1; i != NumConcat; ++i)
29043 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
29046 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
29047 Mld->getBasePtr(), NewMask, WideSrc0,
29048 Mld->getMemoryVT(), Mld->getMemOperand(),
29050 SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
29051 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
29054 /// If exactly one element of the mask is set for a non-truncating masked store,
29055 /// it is a vector extract and scalar store.
29056 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
29057 /// mask have already been optimized in IR, so we don't bother with those here.
29058 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
29059 SelectionDAG &DAG) {
29060 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
29061 // However, some target hooks may need to be added to know when the transform
29062 // is profitable. Endianness would also have to be considered.
29064 SDValue Addr, VecIndex;
29065 unsigned Alignment;
29066 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
29069 // Extract the one scalar element that is actually being stored.
29071 EVT VT = MS->getValue().getValueType();
29072 EVT EltVT = VT.getVectorElementType();
29073 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
29074 MS->getValue(), VecIndex);
29076 // Store that element at the appropriate offset from the base pointer.
29077 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
29078 Alignment, MS->getMemOperand()->getFlags());
29081 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
29082 const X86Subtarget &Subtarget) {
29083 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
29084 if (!Mst->isTruncatingStore())
29085 return reduceMaskedStoreToScalarStore(Mst, DAG);
29087 // Resolve truncating stores.
29088 EVT VT = Mst->getValue().getValueType();
29089 unsigned NumElems = VT.getVectorNumElements();
29090 EVT StVT = Mst->getMemoryVT();
29093 assert(StVT != VT && "Cannot truncate to the same type");
29094 unsigned FromSz = VT.getVectorElementType().getSizeInBits();
29095 unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
29097 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29099 // The truncating store is legal in some cases. For example
29100 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
29101 // are designated for truncate store.
29102 // In this case we don't need any further transformations.
29103 if (TLI.isTruncStoreLegal(VT, StVT))
29106 // From/To sizes and ElemCount must be pow of two.
29107 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
29108 "Unexpected size for truncating masked store");
29109 // We are going to use the original vector elt for storing.
29110 // Accumulated smaller vector elements must be a multiple of the store size.
29111 assert (((NumElems * FromSz) % ToSz) == 0 &&
29112 "Unexpected ratio for truncating masked store");
29114 unsigned SizeRatio = FromSz / ToSz;
29115 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
29117 // Create a type on which we perform the shuffle.
29118 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
29119 StVT.getScalarType(), NumElems*SizeRatio);
29121 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
29123 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
29124 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29125 for (unsigned i = 0; i != NumElems; ++i)
29126 ShuffleVec[i] = i * SizeRatio;
29128 // Can't shuffle using an illegal type.
29129 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
29130 "WideVecVT should be legal");
29132 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
29133 DAG.getUNDEF(WideVecVT),
29137 SDValue Mask = Mst->getMask();
29138 if (Mask.getValueType() == VT) {
29139 // Mask and original value have the same type.
29140 NewMask = DAG.getBitcast(WideVecVT, Mask);
29141 for (unsigned i = 0; i != NumElems; ++i)
29142 ShuffleVec[i] = i * SizeRatio;
29143 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
29144 ShuffleVec[i] = NumElems*SizeRatio;
29145 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
29146 DAG.getConstant(0, dl, WideVecVT),
29149 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
29150 unsigned WidenNumElts = NumElems*SizeRatio;
29151 unsigned MaskNumElts = VT.getVectorNumElements();
29152 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
29155 unsigned NumConcat = WidenNumElts / MaskNumElts;
29156 SmallVector<SDValue, 16> Ops(NumConcat);
29157 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
29159 for (unsigned i = 1; i != NumConcat; ++i)
29162 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
29165 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
29166 Mst->getBasePtr(), NewMask, StVT,
29167 Mst->getMemOperand(), false);
29170 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
29171 const X86Subtarget &Subtarget) {
29172 StoreSDNode *St = cast<StoreSDNode>(N);
29173 EVT VT = St->getValue().getValueType();
29174 EVT StVT = St->getMemoryVT();
29176 SDValue StoredVal = St->getOperand(1);
29177 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29179 // If we are saving a concatenation of two XMM registers and 32-byte stores
29180 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
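  // For example, on such targets a 32-byte v8i32 store is rewritten below as
  // two 16-byte stores of the extracted 128-bit halves, at Ptr0 and Ptr0 + 16,
  // joined by a TokenFactor.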
29182 unsigned AddressSpace = St->getAddressSpace();
29183 unsigned Alignment = St->getAlignment();
29184 if (VT.is256BitVector() && StVT == VT &&
29185 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
29186 AddressSpace, Alignment, &Fast) &&
29188 unsigned NumElems = VT.getVectorNumElements();
29192 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
29193 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
29195 SDValue Ptr0 = St->getBasePtr();
29196 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
29199 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
29200 Alignment, St->getMemOperand()->getFlags());
29202 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
29203 std::min(16U, Alignment), St->getMemOperand()->getFlags());
29204 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
29207 // Optimize trunc store (of multiple scalars) to shuffle and store.
29208 // First, pack all of the elements in one place. Next, store to memory
29209 // in fewer chunks.
29210 if (St->isTruncatingStore() && VT.isVector()) {
29211 // Check if we can detect an AVG pattern from the truncation. If yes,
29212 // replace the trunc store by a normal store with the result of X86ISD::AVG
29214 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
29216 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
29217 St->getPointerInfo(), St->getAlignment(),
29218 St->getMemOperand()->getFlags());
29220 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29221 unsigned NumElems = VT.getVectorNumElements();
29222 assert(StVT != VT && "Cannot truncate to the same type");
29223 unsigned FromSz = VT.getVectorElementType().getSizeInBits();
29224 unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
29226 // The truncating store is legal in some cases. For example
29227 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
29228 // are designated for truncate store.
29229 // In this case we don't need any further transformations.
29230 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
29233 // From, To sizes and ElemCount must be pow of two
29234 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
29235 // We are going to use the original vector elt for storing.
29236 // Accumulated smaller vector elements must be a multiple of the store size.
29237 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
29239 unsigned SizeRatio = FromSz / ToSz;
29241 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
29243 // Create a type on which we perform the shuffle
29244 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
29245 StVT.getScalarType(), NumElems*SizeRatio);
29247 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
29249 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
29250 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
29251 for (unsigned i = 0; i != NumElems; ++i)
29252 ShuffleVec[i] = i * SizeRatio;
29254 // Can't shuffle using an illegal type.
29255 if (!TLI.isTypeLegal(WideVecVT))
29258 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
29259 DAG.getUNDEF(WideVecVT),
29261 // At this point all of the data is stored at the bottom of the
29262 // register. We now need to save it to mem.
29264 // Find the largest store unit
29265 MVT StoreType = MVT::i8;
29266 for (MVT Tp : MVT::integer_valuetypes()) {
29267 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
29271 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
29272 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
29273 (64 <= NumElems * ToSz))
29274 StoreType = MVT::f64;
29276 // Bitcast the original vector into a vector of store-size units
29277 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
29278 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
29279 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
29280 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
29281 SmallVector<SDValue, 8> Chains;
29282 SDValue Ptr = St->getBasePtr();
29284 // Perform one or more big stores into memory.
29285 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
29286 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
29287 StoreType, ShuffWide,
29288 DAG.getIntPtrConstant(i, dl));
29290 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
29291 St->getAlignment(), St->getMemOperand()->getFlags());
29292 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
29293 Chains.push_back(Ch);
29296 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
29299 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
29300 // the FP state in cases where an emms may be missing.
29301 // A preferable solution to the general problem is to figure out the right
29302 // places to insert EMMS. This qualifies as a quick hack.
29304 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
29305 if (VT.getSizeInBits() != 64)
29308 const Function *F = DAG.getMachineFunction().getFunction();
29309 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
29311 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
29312 if ((VT.isVector() ||
29313 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
29314 isa<LoadSDNode>(St->getValue()) &&
29315 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
29316 St->getChain().hasOneUse() && !St->isVolatile()) {
29317 SDNode* LdVal = St->getValue().getNode();
29318 LoadSDNode *Ld = nullptr;
29319 int TokenFactorIndex = -1;
29320 SmallVector<SDValue, 8> Ops;
29321 SDNode* ChainVal = St->getChain().getNode();
29322 // Must be a store of a load. We currently handle two cases: the load
29323 // is a direct child, and it's under an intervening TokenFactor. It is
29324 // possible to dig deeper under nested TokenFactors.
29325 if (ChainVal == LdVal)
29326 Ld = cast<LoadSDNode>(St->getChain());
29327 else if (St->getValue().hasOneUse() &&
29328 ChainVal->getOpcode() == ISD::TokenFactor) {
29329 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
29330 if (ChainVal->getOperand(i).getNode() == LdVal) {
29331 TokenFactorIndex = i;
29332 Ld = cast<LoadSDNode>(St->getValue());
29334 Ops.push_back(ChainVal->getOperand(i));
29338 if (!Ld || !ISD::isNormalLoad(Ld))
29341 // If this is not the MMX case, i.e. we are just turning i64 load/store
29342 // into f64 load/store, avoid the transformation if there are multiple
29343 // uses of the loaded value.
29344 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
29349 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
// Otherwise, if it's legal to use f64 SSE instructions, use an f64 load/store
// pair instead.
29352 if (Subtarget.is64Bit() || F64IsLegal) {
29353 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
29354 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
29355 Ld->getPointerInfo(), Ld->getAlignment(),
29356 Ld->getMemOperand()->getFlags());
29357 SDValue NewChain = NewLd.getValue(1);
29358 if (TokenFactorIndex >= 0) {
29359 Ops.push_back(NewChain);
29360 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
29362 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
29363 St->getPointerInfo(), St->getAlignment(),
29364 St->getMemOperand()->getFlags());
29367 // Otherwise, lower to two pairs of 32-bit loads / stores.
29368 SDValue LoAddr = Ld->getBasePtr();
29369 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
29371 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
29372 Ld->getPointerInfo(), Ld->getAlignment(),
29373 Ld->getMemOperand()->getFlags());
29374 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
29375 Ld->getPointerInfo().getWithOffset(4),
29376 MinAlign(Ld->getAlignment(), 4),
29377 Ld->getMemOperand()->getFlags());
29379 SDValue NewChain = LoLd.getValue(1);
29380 if (TokenFactorIndex >= 0) {
29381 Ops.push_back(LoLd);
29382 Ops.push_back(HiLd);
29383 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
29386 LoAddr = St->getBasePtr();
29387 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
29390 DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
29391 St->getAlignment(), St->getMemOperand()->getFlags());
29392 SDValue HiSt = DAG.getStore(
29393 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
29394 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
29395 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
29398 // This is similar to the above case, but here we handle a scalar 64-bit
29399 // integer store that is extracted from a vector on a 32-bit target.
29400 // If we have SSE2, then we can treat it like a floating-point double
29401 // to get past legalization. The execution dependencies fixup pass will
29402 // choose the optimal machine instruction for the store if this really is
29403 // an integer or v2f32 rather than an f64.
29404 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
29405 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
29406 SDValue OldExtract = St->getOperand(1);
29407 SDValue ExtOp0 = OldExtract.getOperand(0);
29408 unsigned VecSize = ExtOp0.getValueSizeInBits();
29409 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
29410 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
29411 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
29412 BitCast, OldExtract.getOperand(1));
29413 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
29414 St->getPointerInfo(), St->getAlignment(),
29415 St->getMemOperand()->getFlags());
29421 /// Return 'true' if this vector operation is "horizontal"
29422 /// and return the operands for the horizontal operation in LHS and RHS. A
29423 /// horizontal operation performs the binary operation on successive elements
29424 /// of its first operand, then on successive elements of its second operand,
29425 /// returning the resulting values in a vector. For example, if
29426 /// A = < float a0, float a1, float a2, float a3 >
29428 /// B = < float b0, float b1, float b2, float b3 >
29429 /// then the result of doing a horizontal operation on A and B is
29430 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
29431 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
29432 /// A horizontal-op B, for some already available A and B, and if so then LHS is
29433 /// set to A, RHS to B, and the routine returns 'true'.
29434 /// Note that the binary operation should have the property that if one of the
29435 /// operands is UNDEF then the result is UNDEF.
29436 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
29437 // Look for the following pattern: if
29438 // A = < float a0, float a1, float a2, float a3 >
29439 // B = < float b0, float b1, float b2, float b3 >
29441 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
29442 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
29443 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
29444 // which is A horizontal-op B.
29446 // At least one of the operands should be a vector shuffle.
29447 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
29448 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
29451 MVT VT = LHS.getSimpleValueType();
29453 assert((VT.is128BitVector() || VT.is256BitVector()) &&
29454 "Unsupported vector type for horizontal add/sub");
29456 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
29457 // operate independently on 128-bit lanes.
29458 unsigned NumElts = VT.getVectorNumElements();
29459 unsigned NumLanes = VT.getSizeInBits()/128;
29460 unsigned NumLaneElts = NumElts / NumLanes;
29461 assert((NumLaneElts % 2 == 0) &&
29462 "Vector type should have an even number of elements in each lane");
29463 unsigned HalfLaneElts = NumLaneElts/2;
29465 // View LHS in the form
29466 // LHS = VECTOR_SHUFFLE A, B, LMask
29467 // If LHS is not a shuffle then pretend it is the shuffle
29468 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
// NOTE: in what follows a default-initialized SDValue represents an UNDEF of
// type VT.
29472 SmallVector<int, 16> LMask(NumElts);
29473 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
29474 if (!LHS.getOperand(0).isUndef())
29475 A = LHS.getOperand(0);
29476 if (!LHS.getOperand(1).isUndef())
29477 B = LHS.getOperand(1);
29478 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
29479 std::copy(Mask.begin(), Mask.end(), LMask.begin());
29481 if (!LHS.isUndef())
29483 for (unsigned i = 0; i != NumElts; ++i)
29487 // Likewise, view RHS in the form
29488 // RHS = VECTOR_SHUFFLE C, D, RMask
29490 SmallVector<int, 16> RMask(NumElts);
29491 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
29492 if (!RHS.getOperand(0).isUndef())
29493 C = RHS.getOperand(0);
29494 if (!RHS.getOperand(1).isUndef())
29495 D = RHS.getOperand(1);
29496 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
29497 std::copy(Mask.begin(), Mask.end(), RMask.begin());
29499 if (!RHS.isUndef())
29501 for (unsigned i = 0; i != NumElts; ++i)
29505 // Check that the shuffles are both shuffling the same vectors.
29506 if (!(A == C && B == D) && !(A == D && B == C))
29509 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
29510 if (!A.getNode() && !B.getNode())
29513 // If A and B occur in reverse order in RHS, then "swap" them (which means
29514 // rewriting the mask).
29516 ShuffleVectorSDNode::commuteMask(RMask);
29518 // At this point LHS and RHS are equivalent to
29519 // LHS = VECTOR_SHUFFLE A, B, LMask
29520 // RHS = VECTOR_SHUFFLE A, B, RMask
29521 // Check that the masks correspond to performing a horizontal operation.
29522 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
29523 for (unsigned i = 0; i != NumLaneElts; ++i) {
29524 int LIdx = LMask[i+l], RIdx = RMask[i+l];
29526 // Ignore any UNDEF components.
29527 if (LIdx < 0 || RIdx < 0 ||
29528 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
29529 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
29532 // Check that successive elements are being operated on. If not, this is
29533 // not a horizontal operation.
29534 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
29535 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
29536 if (!(LIdx == Index && RIdx == Index + 1) &&
29537 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
29542 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
29543 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
29547 /// Do target-specific dag combines on floating-point adds/subs.
29548 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
29549 const X86Subtarget &Subtarget) {
29550 EVT VT = N->getValueType(0);
29551 SDValue LHS = N->getOperand(0);
29552 SDValue RHS = N->getOperand(1);
29553 bool IsFadd = N->getOpcode() == ISD::FADD;
29554 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
29556 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
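// For example, with SSE3 and v4f32:
//   (fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>)) --> (FHADD A, B)
// which typically selects to a single haddps.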
29557 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
29558 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
29559 isHorizontalBinOp(LHS, RHS, IsFadd)) {
29560 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
29561 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
29566 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
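/// For example, a v8i16 result built from two v4i32 inputs is formed by
/// masking each input with 0xFFFF and merging the pair with a single PACKUS.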
29568 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
29569 SmallVector<SDValue, 8> &Regs) {
29570 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
29571 Regs[0].getValueType() == MVT::v2i64));
29572 EVT OutVT = N->getValueType(0);
29573 EVT OutSVT = OutVT.getVectorElementType();
29574 EVT InVT = Regs[0].getValueType();
29575 EVT InSVT = InVT.getVectorElementType();
29578 // First, use mask to unset all bits that won't appear in the result.
29579 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
29580 "OutSVT can only be either i8 or i16.");
29582 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
29583 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
29584 for (auto &Reg : Regs)
29585 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
29587 MVT UnpackedVT, PackedVT;
29588 if (OutSVT == MVT::i8) {
29589 UnpackedVT = MVT::v8i16;
29590 PackedVT = MVT::v16i8;
29592 UnpackedVT = MVT::v4i32;
29593 PackedVT = MVT::v8i16;
29596 // In each iteration, truncate the type by a half size.
29597 auto RegNum = Regs.size();
29598 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
29599 j < e; j *= 2, RegNum /= 2) {
29600 for (unsigned i = 0; i < RegNum; i++)
29601 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
29602 for (unsigned i = 0; i < RegNum / 2; i++)
29603 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
// If the type of the result is v8i8, we need to do one more X86ISD::PACKUS, and
29608 // then extract a subvector as the result since v8i8 is not a legal type.
29609 if (OutVT == MVT::v8i8) {
29610 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
29611 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
29612 DAG.getIntPtrConstant(0, DL));
29614 } else if (RegNum > 1) {
29615 Regs.resize(RegNum);
29616 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
29621 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
29623 combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
29624 SmallVector<SDValue, 8> &Regs) {
29625 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
29626 EVT OutVT = N->getValueType(0);
29629 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
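// Shifting left and then arithmetic-shifting right by 16 sign-extends the low
// 16 bits of each element, so the signed saturation performed by PACKSS
// returns exactly those low 16 bits.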
29630 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
29631 for (auto &Reg : Regs) {
29632 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
29633 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
29636 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
29637 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
29640 if (Regs.size() > 2) {
29641 Regs.resize(Regs.size() / 2);
29642 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
29647 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
29648 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
29649 /// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element extracted from a vector and then truncated, and it is difficult to
/// perform this optimization starting from that form.
29652 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
29653 const X86Subtarget &Subtarget) {
29654 EVT OutVT = N->getValueType(0);
29655 if (!OutVT.isVector())
29658 SDValue In = N->getOperand(0);
29659 if (!In.getValueType().isSimple())
29662 EVT InVT = In.getValueType();
29663 unsigned NumElems = OutVT.getVectorNumElements();
29665 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
29666 // SSE2, and we need to take care of it specially.
29667 // AVX512 provides vpmovdb.
29668 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
29671 EVT OutSVT = OutVT.getVectorElementType();
29672 EVT InSVT = InVT.getVectorElementType();
29673 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
29674 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
// SSSE3's pshufb results in fewer instructions in the cases below.
29679 if (Subtarget.hasSSSE3() && NumElems == 8 &&
29680 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
29681 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
29686 // Split a long vector into vectors of legal type.
29687 unsigned RegNum = InVT.getSizeInBits() / 128;
29688 SmallVector<SDValue, 8> SubVec(RegNum);
29689 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
29690 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
29692 for (unsigned i = 0; i < RegNum; i++)
29693 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
29694 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
29696 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
29697 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
29698 // truncate 2 x v4i32 to v8i16.
29699 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
29700 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
29701 else if (InSVT == MVT::i32)
29702 return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
29707 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
29708 const X86Subtarget &Subtarget) {
29709 EVT VT = N->getValueType(0);
29710 SDValue Src = N->getOperand(0);
29713 // Try to detect AVG pattern first.
29714 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
// If the truncate's source is a bitcast of a direct MMX result, read the
// low 32 bits with MMX_MOVD2W instead.
29719 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
29720 SDValue BCSrc = Src.getOperand(0);
29721 if (BCSrc.getValueType() == MVT::x86mmx)
29722 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
29725 return combineVectorTruncation(N, DAG, Subtarget);
29728 /// Do target-specific dag combines on floating point negations.
29729 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
29730 const X86Subtarget &Subtarget) {
29731 EVT VT = N->getValueType(0);
29732 EVT SVT = VT.getScalarType();
29733 SDValue Arg = N->getOperand(0);
29736 // Let legalize expand this if it isn't a legal type yet.
29737 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
29740 // If we're negating a FMUL node on a target with FMA, then we can avoid the
29741 // use of a constant by performing (-0 - A*B) instead.
29742 // FIXME: Check rounding control flags as well once it becomes available.
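// For example, (fneg (fmul A, B)) --> (FNMSUB A, B, 0.0), i.e. -(A*B) - 0.0,
// which is only equivalent because the fmul carries the no-signed-zeros flag.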
29743 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
29744 Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
29745 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
29746 return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
29747 Arg.getOperand(1), Zero);
29750 // If we're negating a FMA node, then we can adjust the
29751 // instruction to include the extra negation.
29752 if (Arg.hasOneUse()) {
29753 switch (Arg.getOpcode()) {
29754 case X86ISD::FMADD:
29755 return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
29756 Arg.getOperand(1), Arg.getOperand(2));
29757 case X86ISD::FMSUB:
29758 return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
29759 Arg.getOperand(1), Arg.getOperand(2));
29760 case X86ISD::FNMADD:
29761 return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
29762 Arg.getOperand(1), Arg.getOperand(2));
29763 case X86ISD::FNMSUB:
29764 return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
29765 Arg.getOperand(1), Arg.getOperand(2));
29771 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
29772 const X86Subtarget &Subtarget) {
29773 EVT VT = N->getValueType(0);
29774 if (VT.is512BitVector() && !Subtarget.hasDQI()) {
// VXORPS, VORPS, VANDPS, VANDNPS are supported only with the DQ extension.
29776 // These logic operations may be executed in the integer domain.
29778 MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
29779 MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
29781 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
29782 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
29783 unsigned IntOpcode = 0;
29784 switch (N->getOpcode()) {
29785 default: llvm_unreachable("Unexpected FP logic op");
29786 case X86ISD::FOR: IntOpcode = ISD::OR; break;
29787 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
29788 case X86ISD::FAND: IntOpcode = ISD::AND; break;
29789 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
29791 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
29792 return DAG.getBitcast(VT, IntOp);
29796 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
29797 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
29798 const X86Subtarget &Subtarget) {
29799 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
29801 // F[X]OR(0.0, x) -> x
29802 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29803 if (C->getValueAPF().isPosZero())
29804 return N->getOperand(1);
29806 // F[X]OR(x, 0.0) -> x
29807 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29808 if (C->getValueAPF().isPosZero())
29809 return N->getOperand(0);
29811 return lowerX86FPLogicOp(N, DAG, Subtarget);
29814 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
29815 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
29816 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
29818 // Only perform optimizations if UnsafeMath is used.
29819 if (!DAG.getTarget().Options.UnsafeFPMath)
// If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
// into FMAXC and FMINC, which are commutative operations.
29824 unsigned NewOp = 0;
29825 switch (N->getOpcode()) {
29826 default: llvm_unreachable("unknown opcode");
29827 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
29828 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
29831 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
29832 N->getOperand(0), N->getOperand(1));
29835 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
29836 const X86Subtarget &Subtarget) {
29837 if (Subtarget.useSoftFloat())
29840 // TODO: Check for global or instruction-level "nnan". In that case, we
29841 // should be able to lower to FMAX/FMIN alone.
29842 // TODO: If an operand is already known to be a NaN or not a NaN, this
29843 // should be an optional swap and FMAX/FMIN.
29845 EVT VT = N->getValueType(0);
29846 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
29847 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
29848 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
29851 // This takes at least 3 instructions, so favor a library call when operating
29852 // on a scalar and minimizing code size.
29853 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
29856 SDValue Op0 = N->getOperand(0);
29857 SDValue Op1 = N->getOperand(1);
29859 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
29860 DAG.getDataLayout(), *DAG.getContext(), VT);
// There are 4 possibilities involving NaN inputs, and these are the required
// outputs:
//                    Op1
//                Num     NaN
//             ----------------
//      Num    |  Max  |  Op0 |
// Op0         ----------------
//      NaN    |  Op1  |  NaN |
//             ----------------
// The SSE FP max/min instructions were not designed for this case, but rather
// to implement:
//   Min = Op1 < Op0 ? Op1 : Op0
//   Max = Op1 > Op0 ? Op1 : Op0
29877 // So they always return Op0 if either input is a NaN. However, we can still
29878 // use those instructions for fmaxnum by selecting away a NaN input.
29880 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
29881 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
29882 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
29883 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
29885 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
29886 // are NaN, the NaN value of Op1 is the result.
29887 auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
29888 return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
29891 /// Do target-specific dag combines on X86ISD::FAND nodes.
29892 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
29893 const X86Subtarget &Subtarget) {
29894 // FAND(0.0, x) -> 0.0
29895 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29896 if (C->getValueAPF().isPosZero())
29897 return N->getOperand(0);
29899 // FAND(x, 0.0) -> 0.0
29900 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29901 if (C->getValueAPF().isPosZero())
29902 return N->getOperand(1);
29904 return lowerX86FPLogicOp(N, DAG, Subtarget);
29907 /// Do target-specific dag combines on X86ISD::FANDN nodes
29908 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
29909 const X86Subtarget &Subtarget) {
29910 // FANDN(0.0, x) -> x
29911 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29912 if (C->getValueAPF().isPosZero())
29913 return N->getOperand(1);
29915 // FANDN(x, 0.0) -> 0.0
29916 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29917 if (C->getValueAPF().isPosZero())
29918 return N->getOperand(1);
29920 return lowerX86FPLogicOp(N, DAG, Subtarget);
29923 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
29924 TargetLowering::DAGCombinerInfo &DCI) {
29925 // BT ignores high bits in the bit index operand.
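// For example, for a register-form 32-bit BT the bit index is taken modulo 32,
// so a mask such as (and idx, 31) on the index operand is redundant and can be
// simplified away here.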
29926 SDValue Op1 = N->getOperand(1);
29927 if (Op1.hasOneUse()) {
29928 unsigned BitWidth = Op1.getValueSizeInBits();
29929 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
29930 APInt KnownZero, KnownOne;
29931 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
29932 !DCI.isBeforeLegalizeOps());
29933 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29934 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
29935 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
29936 DCI.CommitTargetLoweringOpt(TLO);
29941 static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) {
29942 SDValue Op = peekThroughBitcasts(N->getOperand(0));
29943 EVT VT = N->getValueType(0), OpVT = Op.getValueType();
29944 if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
29945 VT.getVectorElementType().getSizeInBits() ==
29946 OpVT.getVectorElementType().getSizeInBits()) {
29947 return DAG.getBitcast(VT, Op);
29952 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
29953 const X86Subtarget &Subtarget) {
29954 EVT VT = N->getValueType(0);
29955 if (!VT.isVector())
29958 SDValue N0 = N->getOperand(0);
29959 SDValue N1 = N->getOperand(1);
29960 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
// SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2,
// since there is no sign-extending (arithmetic) shift-right operation on a
// vector with 64-bit elements.
29966 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
29967 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
29968 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
29969 N0.getOpcode() == ISD::SIGN_EXTEND)) {
29970 SDValue N00 = N0.getOperand(0);
// EXTLOAD has a better solution on AVX2:
// it may be replaced with an X86ISD::VSEXT node.
29974 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
29975 if (!ISD::isNormalLoad(N00.getNode()))
29978 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
29979 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
29981 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
29987 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
29988 /// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
29989 /// to combine math ops, use an LEA, or use a complex addressing mode. This can
29990 /// eliminate extend, add, and shift instructions.
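/// For example: (i64 sext (i32 add nsw X, 5)) --> (i64 add nsw (sext X), 5);
/// the sign-extended constant can then become the displacement of an LEA.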
29991 static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
29992 const X86Subtarget &Subtarget) {
29993 // TODO: This should be valid for other integer types.
29994 EVT VT = Sext->getValueType(0);
29995 if (VT != MVT::i64)
29998 // We need an 'add nsw' feeding into the 'sext'.
29999 SDValue Add = Sext->getOperand(0);
30000 if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
30003 // Having a constant operand to the 'add' ensures that we are not increasing
30004 // the instruction count because the constant is extended for free below.
30005 // A constant operand can also become the displacement field of an LEA.
30006 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
30010 // Don't make the 'add' bigger if there's no hope of combining it with some
30011 // other 'add' or 'shl' instruction.
30012 // TODO: It may be profitable to generate simpler LEA instructions in place
30013 // of single 'add' instructions, but the cost model for selecting an LEA
30014 // currently has a high threshold.
30015 bool HasLEAPotential = false;
30016 for (auto *User : Sext->uses()) {
30017 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
30018 HasLEAPotential = true;
30022 if (!HasLEAPotential)
30025 // Everything looks good, so pull the 'sext' ahead of the 'add'.
30026 int64_t AddConstant = AddOp1->getSExtValue();
30027 SDValue AddOp0 = Add.getOperand(0);
30028 SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
30029 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
// The wider add is guaranteed to not wrap because both operands are
// sign-extended.
30034 Flags.setNoSignedWrap(true);
30035 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
30038 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
30039 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
30040 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
30041 /// extends from AH (which we otherwise need to do contortions to access).
30042 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
30043 SDValue N0 = N->getOperand(0);
30044 auto OpcodeN = N->getOpcode();
30045 auto OpcodeN0 = N0.getOpcode();
30046 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
30047 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
30050 EVT VT = N->getValueType(0);
30051 EVT InVT = N0.getValueType();
30052 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
30055 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
30056 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
30057 : X86ISD::UDIVREM8_ZEXT_HREG;
30058 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
30060 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
30061 return R.getValue(1);
30064 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
30065 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
30066 /// with UNDEFs) of the input to vectors of the same size as the target type
30067 /// which then extends the lowest elements.
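/// For example, (v4i32 zext (v4i8 x)) becomes a ZERO_EXTEND_VECTOR_INREG of x
/// widened to v16i8 with undefs, which on SSE4.1 typically selects to a single
/// pmovzxbd.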
30068 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
30069 TargetLowering::DAGCombinerInfo &DCI,
30070 const X86Subtarget &Subtarget) {
30071 unsigned Opcode = N->getOpcode();
30072 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
30074 if (!DCI.isBeforeLegalizeOps())
30076 if (!Subtarget.hasSSE2())
30079 SDValue N0 = N->getOperand(0);
30080 EVT VT = N->getValueType(0);
30081 EVT SVT = VT.getScalarType();
30082 EVT InVT = N0.getValueType();
30083 EVT InSVT = InVT.getScalarType();
30085 // Input type must be a vector and we must be extending legal integer types.
30086 if (!VT.isVector())
30088 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
30090 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
30093 // On AVX2+ targets, if the input/output types are both legal then we will be
30094 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
30095 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
30096 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
30101 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
30102 EVT InVT = N.getValueType();
30103 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
30104 Size / InVT.getScalarSizeInBits());
30105 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
30106 DAG.getUNDEF(InVT));
30108 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
30111 // If target-size is less than 128-bits, extend to a type that would extend
30112 // to 128 bits, extend that and extract the original target vector.
30113 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
30114 unsigned Scale = 128 / VT.getSizeInBits();
30116 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
30117 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
30118 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
30119 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
30120 DAG.getIntPtrConstant(0, DL));
30123 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
30124 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
30125 // Also use this if we don't have SSE41 to allow the legalizer do its job.
30126 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
30127 (VT.is256BitVector() && Subtarget.hasInt256())) {
30128 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
30129 return Opcode == ISD::SIGN_EXTEND
30130 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
30131 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
30134 // On pre-AVX2 targets, split into 128-bit nodes of
30135 // ISD::*_EXTEND_VECTOR_INREG.
30136 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
30137 unsigned NumVecs = VT.getSizeInBits() / 128;
30138 unsigned NumSubElts = 128 / SVT.getSizeInBits();
30139 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
30140 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
30142 SmallVector<SDValue, 8> Opnds;
30143 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
30144 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
30145 DAG.getIntPtrConstant(Offset, DL));
30146 SrcVec = ExtendVecSize(DL, SrcVec, 128);
30147 SrcVec = Opcode == ISD::SIGN_EXTEND
30148 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
30149 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
30150 Opnds.push_back(SrcVec);
30152 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
30158 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
30159 TargetLowering::DAGCombinerInfo &DCI,
30160 const X86Subtarget &Subtarget) {
30161 SDValue N0 = N->getOperand(0);
30162 EVT VT = N->getValueType(0);
30163 EVT InVT = N0.getValueType();
30166 if (SDValue DivRem8 = getDivRem8(N, DAG))
30169 if (!DCI.isBeforeLegalizeOps()) {
30170 if (InVT == MVT::i1) {
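// sext(i1 X) --> select X, all-ones, 0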
30171 SDValue Zero = DAG.getConstant(0, DL, VT);
30173 DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
30174 return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
30179 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
30182 if (Subtarget.hasAVX() && VT.is256BitVector())
30183 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
30186 if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
30192 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
30193 const X86Subtarget &Subtarget) {
30195 EVT VT = N->getValueType(0);
30197 // Let legalize expand this if it isn't a legal type yet.
30198 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30201 EVT ScalarVT = VT.getScalarType();
30202 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
30205 SDValue A = N->getOperand(0);
30206 SDValue B = N->getOperand(1);
30207 SDValue C = N->getOperand(2);
30209 bool NegA = (A.getOpcode() == ISD::FNEG);
30210 bool NegB = (B.getOpcode() == ISD::FNEG);
30211 bool NegC = (C.getOpcode() == ISD::FNEG);
30213 // Negative multiplication when NegA xor NegB
30214 bool NegMul = (NegA != NegB);
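// For example, (fma (fneg A), B, (fneg C)) has NegMul and NegC set, so it
// becomes FNMSUB(A, B, C), i.e. -(A*B) - C.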
30216 A = A.getOperand(0);
30218 B = B.getOperand(0);
30220 C = C.getOperand(0);
30224 Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
30226 Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
30228 return DAG.getNode(Opcode, dl, VT, A, B, C);
30231 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
30232 TargetLowering::DAGCombinerInfo &DCI,
30233 const X86Subtarget &Subtarget) {
30234 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
30235 // (and (i32 x86isd::setcc_carry), 1)
30236 // This eliminates the zext. This transformation is necessary because
30237 // ISD::SETCC is always legalized to i8.
30239 SDValue N0 = N->getOperand(0);
30240 EVT VT = N->getValueType(0);
30242 if (N0.getOpcode() == ISD::AND &&
30244 N0.getOperand(0).hasOneUse()) {
30245 SDValue N00 = N0.getOperand(0);
30246 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30247 if (!isOneConstant(N0.getOperand(1)))
30249 return DAG.getNode(ISD::AND, dl, VT,
30250 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
30251 N00.getOperand(0), N00.getOperand(1)),
30252 DAG.getConstant(1, dl, VT));
30256 if (N0.getOpcode() == ISD::TRUNCATE &&
30258 N0.getOperand(0).hasOneUse()) {
30259 SDValue N00 = N0.getOperand(0);
30260 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30261 return DAG.getNode(ISD::AND, dl, VT,
30262 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
30263 N00.getOperand(0), N00.getOperand(1)),
30264 DAG.getConstant(1, dl, VT));
30268 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
30271 if (VT.is256BitVector())
30272 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
30275 if (SDValue DivRem8 = getDivRem8(N, DAG))
30281 /// Optimize x == -y --> x+y == 0
30282 /// x != -y --> x+y != 0
30283 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
30284 const X86Subtarget &Subtarget) {
30285 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
30286 SDValue LHS = N->getOperand(0);
30287 SDValue RHS = N->getOperand(1);
30288 EVT VT = N->getValueType(0);
30291 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
30292 if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
30293 SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
30294 LHS.getOperand(1));
30295 return DAG.getSetCC(DL, N->getValueType(0), addV,
30296 DAG.getConstant(0, DL, addV.getValueType()), CC);
30298 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
30299 if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
30300 SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
30301 RHS.getOperand(1));
30302 return DAG.getSetCC(DL, N->getValueType(0), addV,
30303 DAG.getConstant(0, DL, addV.getValueType()), CC);
30306 if (VT.getScalarType() == MVT::i1 &&
30307 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
30309 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
30310 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
30311 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
30313 if (!IsSEXT0 || !IsVZero1) {
30314 // Swap the operands and update the condition code.
30315 std::swap(LHS, RHS);
30316 CC = ISD::getSetCCSwappedOperands(CC);
30318 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
30319 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
30320 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
30323 if (IsSEXT0 && IsVZero1) {
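// Here LHS is sext(i1 X), i.e. 0 or -1, and RHS is the zero vector, so:
//   X >  0 is always false, X <= 0 is always true,
//   X == 0 and X >= 0 both reduce to NOT X, and X != 0 / X < 0 reduce to X.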
30324 assert(VT == LHS.getOperand(0).getValueType() &&
"Unexpected operand type");
30326 if (CC == ISD::SETGT)
30327 return DAG.getConstant(0, DL, VT);
30328 if (CC == ISD::SETLE)
30329 return DAG.getConstant(1, DL, VT);
30330 if (CC == ISD::SETEQ || CC == ISD::SETGE)
30331 return DAG.getNOT(DL, LHS.getOperand(0), VT);
30333 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
30334 "Unexpected condition code!");
30335 return LHS.getOperand(0);
30339 // For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization
30340 // via legalization because v4i32 is not a legal type.
30341 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32)
30342 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
30347 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
30349 // Gather and Scatter instructions use k-registers for masks. The type of
30350 // the masks is v*i1. So the mask will be truncated anyway.
// The SIGN_EXTEND_INREG may be dropped.
30352 SDValue Mask = N->getOperand(2);
30353 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
30354 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
30355 NewOps[2] = Mask.getOperand(0);
30356 DAG.UpdateNodeOperands(N, NewOps);
// Helper function of combineX86SetCC. It materializes "setb reg"
// as "sbb reg,reg", since it can be extended without a zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
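// For example, instead of "setb %al; movzbl %al, %eax", the DAG built here
// typically selects to "sbb %eax, %eax; and $1, %eax", where the intermediate
// all-ones value can also be reused directly as a mask.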
30364 static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
30365 SelectionDAG &DAG, MVT VT) {
30367 return DAG.getNode(ISD::AND, DL, VT,
30368 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
30369 DAG.getConstant(X86::COND_B, DL, MVT::i8),
30371 DAG.getConstant(1, DL, VT));
assert(VT == MVT::i1 && "Unexpected type for SETCC node");
30373 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
30374 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
30375 DAG.getConstant(X86::COND_B, DL, MVT::i8),
30379 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
30380 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
30381 TargetLowering::DAGCombinerInfo &DCI,
30382 const X86Subtarget &Subtarget) {
30384 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
30385 SDValue EFLAGS = N->getOperand(1);
30387 if (CC == X86::COND_A) {
30388 // Try to convert COND_A into COND_B in an attempt to facilitate
30389 // materializing "setb reg".
30391 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
30392 // cannot take an immediate as its first operand.
30394 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
30395 EFLAGS.getValueType().isInteger() &&
30396 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
30397 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
30398 EFLAGS.getNode()->getVTList(),
30399 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
30400 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
30401 return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
30405 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
// a zext and produces an all-ones bit which is more useful than 0/1 in some
// cases.
30408 if (CC == X86::COND_B)
30409 return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
30411 // Try to simplify the EFLAGS and condition code operands.
30412 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
30413 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
30414 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
30420 /// Optimize branch condition evaluation.
30421 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
30422 TargetLowering::DAGCombinerInfo &DCI,
30423 const X86Subtarget &Subtarget) {
30425 SDValue EFLAGS = N->getOperand(3);
30426 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
30428 // Try to simplify the EFLAGS and condition code operands.
30429 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
30430 // RAUW them under us.
30431 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
30432 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
30433 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
30434 N->getOperand(1), Cond, Flags);
30440 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
30441 SelectionDAG &DAG) {
30442 // Take advantage of vector comparisons producing 0 or -1 in each lane to
30443 // optimize away operation when it's from a constant.
30445 // The general transformation is:
30446 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
30447 // AND(VECTOR_CMP(x,y), constant2)
30448 // constant2 = UNARYOP(constant)
30450 // Early exit if this isn't a vector operation, the operand of the
30451 // unary operation isn't a bitwise AND, or if the sizes of the operations
30452 // aren't the same.
30453 EVT VT = N->getValueType(0);
30454 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
30455 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
30456 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
30459 // Now check that the other operand of the AND is a constant. We could
30460 // make the transformation for non-constant splats as well, but it's unclear
30461 // that would be a benefit as it would not eliminate any operations, just
30462 // perform one more step in scalar code before moving to the vector unit.
30463 if (BuildVectorSDNode *BV =
30464 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
30465 // Bail out if the vector isn't a constant.
30466 if (!BV->isConstant())
30469 // Everything checks out. Build up the new and improved node.
30471 EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed DAG node.
30474 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
30475 // The AND node needs bitcasts to/from an integer vector type around it.
30476 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
30477 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
30478 N->getOperand(0)->getOperand(0), MaskConst);
30479 SDValue Res = DAG.getBitcast(VT, NewAnd);
30486 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
30487 const X86Subtarget &Subtarget) {
30488 SDValue Op0 = N->getOperand(0);
30489 EVT VT = N->getValueType(0);
30490 EVT InVT = Op0.getValueType();
30491 EVT InSVT = InVT.getScalarType();
30492 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30494 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
30495 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
30496 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
30498 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
30499 InVT.getVectorNumElements());
30500 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
30502 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
30503 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
30505 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
30511 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
30512 const X86Subtarget &Subtarget) {
30513 // First try to optimize away the conversion entirely when it's
30514 // conditionally from a constant. Vectors only.
30515 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
30518 // Now move on to more general possibilities.
30519 SDValue Op0 = N->getOperand(0);
30520 EVT VT = N->getValueType(0);
30521 EVT InVT = Op0.getValueType();
30522 EVT InSVT = InVT.getScalarType();
30524 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
30525 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
30526 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
30528 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
30529 InVT.getVectorNumElements());
30530 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
30531 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
30534 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
30535 // a 32-bit target where SSE doesn't support i64->FP operations.
30536 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
30537 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
30538 EVT LdVT = Ld->getValueType(0);
30540 // This transformation is not supported if the result type is f16 or f128.
30541 if (VT == MVT::f16 || VT == MVT::f128)
30544 if (!Ld->isVolatile() && !VT.isVector() &&
30545 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
30546 !Subtarget.is64Bit() && LdVT == MVT::i64) {
30547 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
30548 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
30549 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
30556 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
30557 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
30558 X86TargetLowering::DAGCombinerInfo &DCI) {
30559 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
30560 // the result is either zero or one (depending on the input carry bit).
30561 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
30562 if (X86::isZeroNode(N->getOperand(0)) &&
30563 X86::isZeroNode(N->getOperand(1)) &&
// We don't have a good way to replace an EFLAGS use, so only do this when
// the EFLAGS result is unused.
30566 SDValue(N, 1).use_empty()) {
30568 EVT VT = N->getValueType(0);
30569 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
30570 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
30571 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
30572 DAG.getConstant(X86::COND_B, DL,
30575 DAG.getConstant(1, DL, VT));
30576 return DCI.CombineTo(N, Res1, CarryOut);
30582 /// fold (add Y, (sete X, 0)) -> adc 0, Y
30583 /// (add Y, (setne X, 0)) -> sbb -1, Y
30584 /// (sub (sete X, 0), Y) -> sbb 0, Y
30585 /// (sub (setne X, 0), Y) -> adc -1, Y
30586 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
30589 // Look through ZExts.
30590 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
30591 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
30594 SDValue SetCC = Ext.getOperand(0);
30595 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
30598 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
30599 if (CC != X86::COND_E && CC != X86::COND_NE)
30602 SDValue Cmp = SetCC.getOperand(1);
30603 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
30604 !X86::isZeroNode(Cmp.getOperand(1)) ||
30605 !Cmp.getOperand(0).getValueType().isInteger())
30608 SDValue CmpOp0 = Cmp.getOperand(0);
30609 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
30610 DAG.getConstant(1, DL, CmpOp0.getValueType()));
30612 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
30613 if (CC == X86::COND_NE)
30614 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
30615 DL, OtherVal.getValueType(), OtherVal,
30616 DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
30618 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
30619 DL, OtherVal.getValueType(), OtherVal,
30620 DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
30623 static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
30624 const X86Subtarget &Subtarget) {
30626 EVT VT = N->getValueType(0);
30627 SDValue Op0 = N->getOperand(0);
30628 SDValue Op1 = N->getOperand(1);
30630 if (!VT.isVector() || !VT.isSimple() ||
30631 !(VT.getVectorElementType() == MVT::i32))
unsigned RegSize = 128;
if (Subtarget.hasBWI())
  RegSize = 512;
else if (Subtarget.hasAVX2())
  RegSize = 256;
30640 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
30641 if (VT.getSizeInBits() / 4 > RegSize)
30644 // Detect the following pattern:
30646 // 1: %2 = zext <N x i8> %0 to <N x i32>
30647 // 2: %3 = zext <N x i8> %1 to <N x i32>
30648 // 3: %4 = sub nsw <N x i32> %2, %3
30649 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
30650 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
30651 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
30652 // 7: %8 = add nsw <N x i32> %7, %vec.phi
// The last instruction must be a reduction add. Instructions 3-6 form an
// ABSDIFF pattern.
// The two operands of the reduction add are a PHI and a select-op, as in
// line 7 above.
30659 SDValue SelectOp, Phi;
30660 if (Op0.getOpcode() == ISD::VSELECT) {
30663 } else if (Op1.getOpcode() == ISD::VSELECT) {
30669 // Check the condition of the select instruction is greater-than.
30670 SDValue SetCC = SelectOp->getOperand(0);
30671 if (SetCC.getOpcode() != ISD::SETCC)
30673 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
30674 if (CC != ISD::SETGT)
30677 Op0 = SelectOp->getOperand(1);
30678 Op1 = SelectOp->getOperand(2);
30680 // The second operand of SelectOp Op1 is the negation of the first operand
30681 // Op0, which is implemented as 0 - Op0.
30682 if (!(Op1.getOpcode() == ISD::SUB &&
30683 ISD::isBuildVectorAllZeros(Op1.getOperand(0).getNode()) &&
30684 Op1.getOperand(1) == Op0))
30687 // The first operand of SetCC is the first operand of SelectOp, which is the
30688 // difference between two input vectors.
30689 if (SetCC.getOperand(0) != Op0)
30692 // The second operand of > comparison can be either -1 or 0.
30693 if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
30694 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
30697 // The first operand of SelectOp is the difference between two input vectors.
30698 if (Op0.getOpcode() != ISD::SUB)
30701 Op1 = Op0.getOperand(1);
30702 Op0 = Op0.getOperand(0);
30704 // Check if the operands of the diff are zero-extended from vectors of i8.
30705 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
30706 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
30707 Op1.getOpcode() != ISD::ZERO_EXTEND ||
30708 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
30711 // SAD pattern detected. Now build a SAD instruction and an addition for
// reduction. Note that the result of SAD has fewer elements than its input.
// Therefore, we only update part of the elements in the reduction vector.
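// For reference: PSADBW sums the absolute differences of eight byte pairs
// into the low 16 bits of each 64-bit lane, so a v16i8 psadbw yields two
// partial sums in a v2i64.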
30716 // Legalize the type of the inputs of PSADBW.
30717 EVT InVT = Op0.getOperand(0).getValueType();
30718 if (InVT.getSizeInBits() <= 128)
30720 else if (InVT.getSizeInBits() <= 256)
30723 unsigned NumConcat = RegSize / InVT.getSizeInBits();
30724 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
30725 Ops[0] = Op0.getOperand(0);
30726 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
30727 Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30728 Ops[0] = Op1.getOperand(0);
30729 Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30731 // The output of PSADBW is a vector of i64.
30732 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
30733 SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1);
30735 // We need to turn the vector of i64 into a vector of i32.
30736 // If the reduction vector is at least as wide as the psadbw result, just
// bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
// anyway.
30739 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30740 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
30741 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
30743 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
30745 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
30746 // Update part of elements of the reduction vector. This is done by first
30747 // extracting a sub-vector from it, updating this sub-vector, and inserting
30749 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
30750 DAG.getIntPtrConstant(0, DL));
30751 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
30752 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
30753 DAG.getIntPtrConstant(0, DL));
30755 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
30758 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
30759 const X86Subtarget &Subtarget) {
30760 const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
30761 if (Flags->hasVectorReduction()) {
30762 if (SDValue Sad = detectSADPattern(N, DAG, Subtarget))
30765 EVT VT = N->getValueType(0);
30766 SDValue Op0 = N->getOperand(0);
30767 SDValue Op1 = N->getOperand(1);
30769 // Try to synthesize horizontal adds from adds of shuffles.
30770 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
30771 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
30772 isHorizontalBinOp(Op0, Op1, true))
30773 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
30775 return OptimizeConditionalInDecrement(N, DAG);
30778 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
30779 const X86Subtarget &Subtarget) {
30780 SDValue Op0 = N->getOperand(0);
30781 SDValue Op1 = N->getOperand(1);
30783 // X86 can't encode an immediate LHS of a sub. See if we can push the
30784 // negation into a preceding instruction.
30785 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
30786 // If the RHS of the sub is a XOR with one use and a constant, invert the
30787 // immediate. Then add one to the LHS of the sub so we can turn
30788 // X-Y -> X+~Y+1, saving one register.
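// For example: (sub 5, (xor X, 1)) --> (add (xor X, ~1), 6), using
// 5 - Y == ~Y + 6 and ~(X ^ 1) == X ^ ~1.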
30789 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
30790 isa<ConstantSDNode>(Op1.getOperand(1))) {
30791 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
30792 EVT VT = Op0.getValueType();
30793 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
30795 DAG.getConstant(~XorC, SDLoc(Op1), VT));
30796 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
30797 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
30801 // Try to synthesize horizontal adds from adds of shuffles.
30802 EVT VT = N->getValueType(0);
30803 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
30804 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
30805 isHorizontalBinOp(Op0, Op1, true))
30806 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
30808 return OptimizeConditionalInDecrement(N, DAG);
30811 static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
30812 TargetLowering::DAGCombinerInfo &DCI,
30813 const X86Subtarget &Subtarget) {
30815 MVT VT = N->getSimpleValueType(0);
30816 MVT SVT = VT.getVectorElementType();
30817 SDValue Op = N->getOperand(0);
30818 MVT OpVT = Op.getSimpleValueType();
30819 MVT OpEltVT = OpVT.getVectorElementType();
30820 unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
30822 // Perform any constant folding.
30823 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
30824 SmallVector<SDValue, 4> Vals;
30825 for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
30826 SDValue OpElt = Op.getOperand(i);
30827 if (OpElt.getOpcode() == ISD::UNDEF) {
30828 Vals.push_back(DAG.getUNDEF(SVT));
30831 APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
30832 assert(Cst.getBitWidth() == OpEltVT.getSizeInBits());
30833 Cst = Cst.zextOrTrunc(SVT.getSizeInBits());
30834 Vals.push_back(DAG.getConstant(Cst, DL, SVT));
30836 return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Vals);
// (vzext (bitcast (vzext x))) -> (vzext x)
30840 SDValue V = peekThroughBitcasts(Op);
30841 if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
30842 MVT InnerVT = V.getSimpleValueType();
30843 MVT InnerEltVT = InnerVT.getVectorElementType();
30845 // If the element sizes match exactly, we can just do one larger vzext. This
30846 // is always an exact type match as vzext operates on integer types.
30847 if (OpEltVT == InnerEltVT) {
30848 assert(OpVT == InnerVT && "Types must match for vzext!");
30849 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
30852 // The only other way we can combine them is if only a single element of the
30853 // inner vzext is used in the input to the outer vzext.
30854 if (InnerEltVT.getSizeInBits() < InputBits)
30857 // In this case, the inner vzext is completely dead because we're going to
30858 // only look at bits inside of the low element. Just do the outer vzext on
30859 // a bitcast of the input to the inner.
30860 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
30863 // Check if we can bypass extracting and re-inserting an element of an input
30864 // vector. Essentially:
30865 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
30866 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
30867 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
30868 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
30869 SDValue ExtractedV = V.getOperand(0);
30870 SDValue OrigV = ExtractedV.getOperand(0);
30871 if (isNullConstant(ExtractedV.getOperand(1))) {
30872 MVT OrigVT = OrigV.getSimpleValueType();
30873 // Extract a subvector if necessary...
30874 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
30875 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
30876 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
30877 OrigVT.getVectorNumElements() / Ratio);
        OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                            DAG.getIntPtrConstant(0, DL));
      }
      Op = DAG.getBitcast(OpVT, OrigV);
      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    }
  }

  return SDValue();
}
30889 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
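/// The stored result is identical either way (p - 1 == p + (-1)); this merely
/// picks LADD as the canonical form for locked decrement-by-one updates.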
30890 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
30891 const X86Subtarget &Subtarget) {
30892 SDValue Chain = N->getOperand(0);
30893 SDValue LHS = N->getOperand(1);
30894 SDValue RHS = N->getOperand(2);
  MVT VT = RHS.getSimpleValueType();
  SDLoc DL(N);

  auto *C = dyn_cast<ConstantSDNode>(RHS);
  if (!C || C->getZExtValue() != 1)
    return SDValue();
30902 RHS = DAG.getConstant(-1, DL, VT);
30903 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
30904 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
30905 DAG.getVTList(MVT::i32, MVT::Other),
                                 {Chain, LHS, RHS}, VT, MMO);
}
// TEST (AND a, b), (AND a, b) -> TEST a, b
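// TESTM sets each mask bit to ((LHS & RHS) != 0), so when both operands are
// the same (AND a, b) node the extra AND is redundant:
// (a & b) & (a & b) == a & b.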
30910 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
30911 SDValue Op0 = N->getOperand(0);
30912 SDValue Op1 = N->getOperand(1);
  if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  return DAG.getNode(X86ISD::TESTM, DL, VT,
                     Op0->getOperand(0), Op0->getOperand(1));
}
30924 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
30925 const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

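  // Comparing a vector against itself has a known result in every lane:
  // equality yields all-ones, signed greater-than yields all-zeros.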
30929 if (N->getOperand(0) == N->getOperand(1)) {
30930 if (N->getOpcode() == X86ISD::PCMPEQ)
30931 return getOnesVector(VT, Subtarget, DAG, DL);
30932 if (N->getOpcode() == X86ISD::PCMPGT)
      return getZeroVector(VT, Subtarget, DAG, DL);
  }

  return SDValue();
}
30940 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
30941 DAGCombinerInfo &DCI) const {
30942 SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
30949 case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
30950 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
30951 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
30952 case ISD::SUB: return combineSub(N, DAG, Subtarget);
30953 case X86ISD::ADC: return combineADC(N, DAG, DCI);
  case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
30958 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
30959 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
30960 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
30961 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
30962 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
30963 case ISD::STORE: return combineStore(N, DAG, Subtarget);
30964 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
30965 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
  case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
30969 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
  case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX: return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
30977 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
30978 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
30979 case X86ISD::BT: return combineBT(N, DAG, DCI);
30980 case X86ISD::VZEXT_MOVL: return combineVZextMovl(N, DAG);
30981 case ISD::ANY_EXTEND:
30982 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
30983 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
30984 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
30985 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
30986 case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
30987 case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
30988 case X86ISD::VZEXT: return combineVZext(N, DAG, DCI, Subtarget);
30989 case X86ISD::SHUFP: // Handle all target specific shuffles
30990 case X86ISD::INSERTPS:
30991 case X86ISD::PALIGNR:
30992 case X86ISD::VSHLDQ:
30993 case X86ISD::VSRLDQ:
30994 case X86ISD::BLENDI:
30995 case X86ISD::UNPCKH:
30996 case X86ISD::UNPCKL:
30997 case X86ISD::MOVHLPS:
30998 case X86ISD::MOVLHPS:
30999 case X86ISD::PSHUFB:
31000 case X86ISD::PSHUFD:
31001 case X86ISD::PSHUFHW:
31002 case X86ISD::PSHUFLW:
31003 case X86ISD::MOVSHDUP:
31004 case X86ISD::MOVSLDUP:
31005 case X86ISD::MOVDDUP:
31006 case X86ISD::MOVSS:
31007 case X86ISD::MOVSD:
31008 case X86ISD::VPPERM:
31009 case X86ISD::VPERMI:
31010 case X86ISD::VPERMV:
31011 case X86ISD::VPERMV3:
31012 case X86ISD::VPERMIL2:
31013 case X86ISD::VPERMILPI:
31014 case X86ISD::VPERMILPV:
31015 case X86ISD::VPERM2X128:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
  case ISD::FMA: return combineFMA(N, DAG, Subtarget);
  case ISD::MGATHER:
  case ISD::MSCATTER: return combineGatherScatter(N, DAG);
31020 case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
31021 case X86ISD::TESTM: return combineTestM(N, DAG);
31022 case X86ISD::PCMPEQ:
  case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
  }

  return SDValue();
}
31029 /// Return true if the target has native support for the specified value type
31030 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
31031 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
31032 /// some i16 instructions are slow.
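/// (Most 16-bit operations need a 0x66 operand-size prefix, so the promoted
/// 32-bit form is usually both shorter to encode and cheaper to decode.)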
31033 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}
31058 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
31059 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
31060 /// we don't adjust the stack we clobber the first frame index.
31061 /// See X86InstrInfo::copyPhysReg.
31062 bool X86TargetLowering::hasCopyImplyingStackAdjustment(
31063 MachineFunction *MF) const {
31064 const MachineRegisterInfo &MRI = MF->getRegInfo();
31066 return any_of(MRI.reg_instructions(X86::EFLAGS),
                [](const MachineInstr &RI) { return RI.isCopy(); });
}
/// This method queries the target whether it is beneficial for the DAG combiner
31071 /// promote the specified node. If true, it should return the desired promotion
31072 /// type by reference.
31073 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
31074 EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;
31078 bool Promote = false;
31079 bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
31089 SDValue N0 = Op.getOperand(0);
31090 // Look out for (store (shl (load), x)).
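    // If both the load and the store can be folded, the shift becomes a single
    // read-modify-write instruction on memory (e.g. "shlw %cl, (%rax)", an
    // illustrative operand choice); promoting to i32 would defeat that folding.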
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    // fallthrough
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
31108 // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}
31121 //===----------------------------------------------------------------------===//
31122 // X86 Inline Assembly Support
31123 //===----------------------------------------------------------------------===//
31125 // Helper to match a string separated by whitespace.
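// For example, matchAsm("bswap %eax", {"bswap", "%eax"}) succeeds, while
// matchAsm("bswapl %eax", {"bswap", "%eax"}) does not: each piece must match
// in order and be followed by whitespace (a matched prefix such as "bswapl"
// against "bswap" is rejected).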
31126 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
31127 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
31129 for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

31133 S = S.substr(Piece.size());
31134 StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return S.empty();
}
31144 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
31146 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
31147 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
31148 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
31149 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}
31160 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
31161 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
31163 const std::string &AsmStr = IA->getAsmString();
31165 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;
31169 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
31170 SmallVector<StringRef, 4> AsmPieces;
31171 SplitString(AsmStr, AsmPieces, ";\n");
31173 switch (AsmPieces.size()) {
  default: return false;
  case 1:
31176 // FIXME: this should verify that we are targeting a 486 or better. If not,
31177 // we will turn this bswap into something that will be lowered to logical
31178 // ops instead of emitting the bswap asm. For now, we don't support 486 or
31179 // lower so don't worry about this.
31181 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
31182 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
31183 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
31184 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
31185 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
31186 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
31187 // No need to check constraints, nothing other than the equivalent of
31188 // "=r,0" would be valid here.
31189 return IntrinsicLowering::LowerToByteSwap(CI);
31192 // rorw $$8, ${0:w} --> llvm.bswap.i16
31193 if (CI->getType()->isIntegerTy(16) &&
31194 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
31195 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
31198 StringRef ConstraintsStr = IA->getConstraintString();
31199 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
31200 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
31201 if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
31206 if (CI->getType()->isIntegerTy(32) &&
31207 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
31208 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
31209 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
31212 StringRef ConstraintsStr = IA->getConstraintString();
31213 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
31214 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
31215 if (clobbersFlagRegisters(AsmPieces))
31216 return IntrinsicLowering::LowerToByteSwap(CI);
31219 if (CI->getType()->isIntegerTy(64)) {
31220 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
31221 if (Constraints.size() >= 2 &&
31222 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
31223 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
31224 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
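        // With the "A" constraint the i64 value lives in the EDX:EAX pair, so
        // byte-swapping each half and then exchanging the two halves reverses
        // all eight bytes of the value.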
31225 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
31226 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
31227 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }

  return false;
}
31236 /// Given a constraint letter, return the type of constraint for this target.
31237 X86TargetLowering::ConstraintType
31238 X86TargetLowering::getConstraintType(StringRef Constraint) const {
31239 if (Constraint.size() == 1) {
31240 switch (Constraint[0]) {
31251 return C_RegisterClass;
31275 return TargetLowering::getConstraintType(Constraint);
31278 /// Examine constraint type and operand type and determine a weight value.
31279 /// This object must already have been set up with the operand type
31280 /// and the current alternative constraint selected.
31281 TargetLowering::ConstraintWeight
31282 X86TargetLowering::getSingleConstraintMatchWeight(
31283 AsmOperandInfo &info, const char *constraint) const {
31284 ConstraintWeight weight = CW_Invalid;
31285 Value *CallOperandVal = info.CallOperandVal;
31286 // If we don't have a value, we can't do a match,
31287 // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
31290 Type *type = CallOperandVal->getType();
31291 // Look at the constraint type.
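  // The single-letter codes below follow the usual GCC x86 constraints, e.g.
  // 'I' is an immediate in [0,31], 'J' in [0,63], 'K' a signed 8-bit value,
  // 'L' is 0xff or 0xffff, 'M' in [0,3], 'N' an unsigned 8-bit value, and
  // 'e'/'Z' are signed/unsigned 32-bit immediates.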
31292 switch (*constraint) {
31294 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
31305 if (CallOperandVal->getType()->isIntegerTy())
31306 weight = CW_SpecificReg;
31311 if (type->isFloatingPointTy())
31312 weight = CW_SpecificReg;
31315 if (type->isX86_MMXTy() && Subtarget.hasMMX())
31316 weight = CW_SpecificReg;
31320 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
31321 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
31322 weight = CW_Register;
31325 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
31326 if (C->getZExtValue() <= 31)
31327 weight = CW_Constant;
31331 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31332 if (C->getZExtValue() <= 63)
31333 weight = CW_Constant;
31337 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31338 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
31339 weight = CW_Constant;
31343 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31344 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
31345 weight = CW_Constant;
31349 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31350 if (C->getZExtValue() <= 3)
31351 weight = CW_Constant;
31355 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31356 if (C->getZExtValue() <= 0xff)
31357 weight = CW_Constant;
31362 if (isa<ConstantFP>(CallOperandVal)) {
31363 weight = CW_Constant;
31367 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31368 if ((C->getSExtValue() >= -0x80000000LL) &&
31369 (C->getSExtValue() <= 0x7fffffffLL))
31370 weight = CW_Constant;
31374 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31375 if (C->getZExtValue() <= 0xffffffff)
31376 weight = CW_Constant;
31383 /// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
31386 const char *X86TargetLowering::
31387 LowerXConstraint(EVT ConstraintVT) const {
31388 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
31389 // 'f' like normal targets.
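  // For example, an "X" operand of floating-point type becomes "Y" (an SSE2
  // register) when SSE2 is available, or "x" when only SSE1 is.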
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
31400 /// Lower the specified operand into the Ops vector.
31401 /// If it is invalid, don't add anything to Ops.
31402 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
31403 std::string &Constraint,
31404 std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

31408 // Only support length 1 constraints for now.
31409 if (Constraint.length() > 1) return;
31411 char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
31415 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31416 if (C->getZExtValue() <= 31) {
31417 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31418 Op.getValueType());
31424 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31425 if (C->getZExtValue() <= 63) {
31426 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31427 Op.getValueType());
31433 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31434 if (isInt<8>(C->getSExtValue())) {
31435 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31436 Op.getValueType());
31442 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31443 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
31444 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
31445 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
31446 Op.getValueType());
31452 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31453 if (C->getZExtValue() <= 3) {
31454 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31455 Op.getValueType());
31461 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31462 if (C->getZExtValue() <= 255) {
31463 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31464 Op.getValueType());
31470 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31471 if (C->getZExtValue() <= 127) {
31472 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31473 Op.getValueType());
31479 // 32-bit signed value
31480 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31481 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
31482 C->getSExtValue())) {
31483 // Widen to 64 bits here to get it sign extended.
31484 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
31487 // FIXME gcc accepts some relocatable values here too, but only in certain
31488 // memory models; it's complicated.
31493 // 32-bit unsigned value
31494 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31495 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
31496 C->getZExtValue())) {
31497 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31498 Op.getValueType());
31502 // FIXME gcc accepts some relocatable values here too, but only in certain
31503 // memory models; it's complicated.
31507 // Literal immediates are always ok.
31508 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
31509 // Widen to 64 bits here to get it sign extended.
31510 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
31514 // In any sort of PIC mode addresses need to be computed at runtime by
31515 // adding in a register or some sort of table lookup. These can't
31516 // be used as immediates.
31517 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
31520 // If we are in non-pic codegen mode, we allow the address of a global (with
31521 // an optional displacement) to be used with 'i'.
31522 GlobalAddressSDNode *GA = nullptr;
31523 int64_t Offset = 0;
31525 // Match either (GA), (GA+C), (GA+C1+C2), etc.
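  // For example (illustrative), an 'i' operand written as "sym+4" reaches here
  // as (add (GlobalAddress @sym), 4) and folds to GA = @sym with Offset = 4.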
31527 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
31528 Offset += GA->getOffset();
31530 } else if (Op.getOpcode() == ISD::ADD) {
31531 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
31532 Offset += C->getZExtValue();
31533 Op = Op.getOperand(0);
31536 } else if (Op.getOpcode() == ISD::SUB) {
31537 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
31538 Offset += -C->getZExtValue();
31539 Op = Op.getOperand(0);
31544 // Otherwise, this isn't something we can handle, reject it.
31548 const GlobalValue *GV = GA->getGlobal();
31549 // If we require an extra load to get this address, as in PIC mode, we
31550 // can't accept it.
31551 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
31554 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
31555 GA->getValueType(0), Offset);
31560 if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
31567 /// Check if \p RC is a general purpose register class.
31568 /// I.e., GR* or one of their variant.
31569 static bool isGRClass(const TargetRegisterClass &RC) {
31570 switch (RC.getID()) {
31571 case X86::GR8RegClassID:
31572 case X86::GR8_ABCD_LRegClassID:
31573 case X86::GR8_ABCD_HRegClassID:
31574 case X86::GR8_NOREXRegClassID:
31575 case X86::GR16RegClassID:
31576 case X86::GR16_ABCDRegClassID:
31577 case X86::GR16_NOREXRegClassID:
31578 case X86::GR32RegClassID:
31579 case X86::GR32_ABCDRegClassID:
31580 case X86::GR32_TCRegClassID:
31581 case X86::GR32_NOREXRegClassID:
31582 case X86::GR32_NOAXRegClassID:
31583 case X86::GR32_NOSPRegClassID:
31584 case X86::GR32_NOREX_NOSPRegClassID:
31585 case X86::GR32_ADRegClassID:
31586 case X86::GR64RegClassID:
31587 case X86::GR64_ABCDRegClassID:
31588 case X86::GR64_TCRegClassID:
31589 case X86::GR64_TCW64RegClassID:
31590 case X86::GR64_NOREXRegClassID:
31591 case X86::GR64_NOSPRegClassID:
31592 case X86::GR64_NOREX_NOSPRegClassID:
31593 case X86::LOW32_ADDR_ACCESSRegClassID:
  case X86::LOW32_ADDR_ACCESS_RBPRegClassID:
    return true;
  default:
    return false;
  }
}
31601 /// Check if \p RC is a vector register class.
31602 /// I.e., FR* / VR* or one of their variant.
31603 static bool isFRClass(const TargetRegisterClass &RC) {
31604 switch (RC.getID()) {
31605 case X86::FR32RegClassID:
31606 case X86::FR32XRegClassID:
31607 case X86::FR64RegClassID:
31608 case X86::FR64XRegClassID:
31609 case X86::FR128RegClassID:
31610 case X86::VR64RegClassID:
31611 case X86::VR128RegClassID:
31612 case X86::VR128LRegClassID:
31613 case X86::VR128HRegClassID:
31614 case X86::VR128XRegClassID:
31615 case X86::VR256RegClassID:
31616 case X86::VR256LRegClassID:
31617 case X86::VR256HRegClassID:
31618 case X86::VR256XRegClassID:
  case X86::VR512RegClassID:
    return true;
  default:
    return false;
  }
}
31626 std::pair<unsigned, const TargetRegisterClass *>
31627 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                  StringRef Constraint,
                                                  MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
31633 // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
31636 // TODO: Slight differences here in allocation order and leaving
31637 // RIP in the class. Do they matter any more here than they do
31638 // in the normal allocation?
31639 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
31640 if (Subtarget.is64Bit()) {
31641 if (VT == MVT::i32 || VT == MVT::f32)
31642 return std::make_pair(0U, &X86::GR32RegClass);
31643 if (VT == MVT::i16)
31644 return std::make_pair(0U, &X86::GR16RegClass);
31645 if (VT == MVT::i8 || VT == MVT::i1)
31646 return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
31651 // 32-bit fallthrough
31652 case 'Q': // Q_REGS
31653 if (VT == MVT::i32 || VT == MVT::f32)
31654 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
31655 if (VT == MVT::i16)
31656 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
31657 if (VT == MVT::i8 || VT == MVT::i1)
31658 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
31662 case 'r': // GENERAL_REGS
31663 case 'l': // INDEX_REGS
31664 if (VT == MVT::i8 || VT == MVT::i1)
31665 return std::make_pair(0U, &X86::GR8RegClass);
31666 if (VT == MVT::i16)
31667 return std::make_pair(0U, &X86::GR16RegClass);
31668 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
31669 return std::make_pair(0U, &X86::GR32RegClass);
31670 return std::make_pair(0U, &X86::GR64RegClass);
31671 case 'R': // LEGACY_REGS
31672 if (VT == MVT::i8 || VT == MVT::i1)
31673 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
31674 if (VT == MVT::i16)
31675 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
31676 if (VT == MVT::i32 || !Subtarget.is64Bit())
31677 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
31678 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
31679 case 'f': // FP Stack registers.
31680 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
31681 // value to the correct fpstack register class.
31682 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
31683 return std::make_pair(0U, &X86::RFP32RegClass);
31684 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
31685 return std::make_pair(0U, &X86::RFP64RegClass);
31686 return std::make_pair(0U, &X86::RFP80RegClass);
31687 case 'y': // MMX_REGS if MMX allowed.
31688 if (!Subtarget.hasMMX()) break;
31689 return std::make_pair(0U, &X86::VR64RegClass);
31690 case 'Y': // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      // Fall through to the SSE1/AVX handling below.
    case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
31694 if (!Subtarget.hasSSE1()) break;
31696 switch (VT.SimpleTy) {
31698 // Scalar SSE types.
31701 return std::make_pair(0U, &X86::FR32RegClass);
31704 return std::make_pair(0U, &X86::FR64RegClass);
31705 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
31713 return std::make_pair(0U, &X86::VR128RegClass);
31721 return std::make_pair(0U, &X86::VR256RegClass);
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  }

31732 // Use the default implementation in TargetLowering to convert the register
31733 // constraint into a member of a register class.
31734 std::pair<unsigned, const TargetRegisterClass*> Res;
31735 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  // Not found as a standard register?
  if (!Res.second) {
31739 // Map st(0) -> st(7) -> ST0
31740 if (Constraint.size() == 7 && Constraint[0] == '{' &&
31741 tolower(Constraint[1]) == 's' &&
31742 tolower(Constraint[2]) == 't' &&
31743 Constraint[3] == '(' &&
31744 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
31745 Constraint[5] == ')' &&
31746 Constraint[6] == '}') {
      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

31753 // GCC allows "st(0)" to be called just plain "st".
31754 if (StringRef("{st}").equals_lower(Constraint)) {
31755 Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
31761 if (StringRef("{flags}").equals_lower(Constraint)) {
31762 Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

31767 // 'A' means EAX + EDX.
31768 if (Constraint == "A") {
31769 Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }

31776 // Otherwise, check to see if this is a register class of the wrong value
31777 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
31778 // turn into {ax},{dx}.
31779 // MVT::Other is used to specify clobber names.
31780 if (Res.second->hasType(VT) || VT == MVT::Other)
31781 return Res; // Correct type already, nothing to do.
  // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
31784 // return "eax". This should even work for things like getting 64bit integer
31785 // registers when given an f64 type.
31786 const TargetRegisterClass *Class = Res.second;
31787 // The generic code will match the first register class that contains the
31788 // given register. Thus, based on the ordering of the tablegened file,
31789 // the "plain" GR classes might not come first.
31790 // Therefore, use a helper method.
31791 if (isGRClass(*Class)) {
31792 unsigned Size = VT.getSizeInBits();
31793 if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      Res.first = DestReg;
31797 Res.second = Size == 8 ? &X86::GR8RegClass
31798 : Size == 16 ? &X86::GR16RegClass
31799 : Size == 32 ? &X86::GR32RegClass
31800 : &X86::GR64RegClass;
      assert(Res.second->contains(Res.first) && "Register in register class");
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
31807 } else if (isFRClass(*Class)) {
31808 // Handle references to XMM physical registers that got mapped into the
31809 // wrong class. This can happen with constraints like {xmm0} where the
31810 // target independent register mapper will just pick the first match it can
31811 // find, ignoring the required type.
31813 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
31814 if (VT == MVT::f32 || VT == MVT::i32)
31815 Res.second = &X86::FR32RegClass;
31816 else if (VT == MVT::f64 || VT == MVT::i64)
31817 Res.second = &X86::FR64RegClass;
31818 else if (X86::VR128RegClass.hasType(VT))
31819 Res.second = &X86::VR128RegClass;
31820 else if (X86::VR256RegClass.hasType(VT))
31821 Res.second = &X86::VR256RegClass;
31822 else if (X86::VR512RegClass.hasType(VT))
31823 Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
31834 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
31835 const AddrMode &AM, Type *Ty,
31836 unsigned AS) const {
31837 // Scaling factors are not free at all.
31838 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
31839 // will take 2 allocations in the out of order engine instead of 1
31840 // for plain addressing mode, i.e. inst (reg1).
  // E.g.:
  //   vaddps (%rsi,%rdx), %ymm0, %ymm1
  // requires two allocations (one for the load, one for the computation),
  // whereas
  //   vaddps (%rsi), %ymm0, %ymm1
  // requires just one allocation, i.e., it frees an allocation for other
  // operations and leaves fewer micro-operations to execute.
31849 // For some X86 architectures, this is even worse because for instance for
31850 // stores, the complex addressing mode forces the instruction to use the
31851 // "load" ports instead of the dedicated "store" port.
31852 // E.g., on Haswell:
31853 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
31854 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
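  // Concretely, the cost below is 0 for a plain (reg) access and 1 as soon as
  // a scaled index register is involved, i.e. one extra allocation.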
31855 if (isLegalAddressingMode(DL, AM, Ty, AS))
31856 // Scale represents reg2 * scale, thus account for 1
31857 // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
31862 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
31863 // Integer division on x86 is expensive. However, when aggressively optimizing
31864 // for code size, we prefer to use a div instruction, as it is usually smaller
31865 // than the alternative sequence.
31866 // The exception to this is vector division. Since x86 doesn't have vector
31867 // integer division, leaving the division as-is is a loss even in terms of
31868 // size, because it will have to be scalarized, while the alternative code
31869 // sequence can be performed in vector form.
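  // For example, at minsize a 32-bit "divl" is a two-byte instruction, whereas
  // the usual magic-number multiply expansion is several instructions long.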
31870 bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
31871 Attribute::MinSize);
31872 return OptSize && !VT.isVector();
31875 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

31879 // Update IsSplitCSR in X86MachineFunctionInfo.
31880 X86MachineFunctionInfo *AFI =
31881 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
31885 void X86TargetLowering::insertCopiesSplitCSR(
31886 MachineBasicBlock *Entry,
31887 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
31888 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

31893 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31894 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
31895 MachineBasicBlock::iterator MBBI = Entry->begin();
31896 for (const MCPhysReg *I = IStart; *I; ++I) {
31897 const TargetRegisterClass *RC = nullptr;
31898 if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

31903 unsigned NewVR = MRI->createVirtualRegister(RC);
31904 // Create copy from CSR to a virtual register.
31905 // FIXME: this currently does not emit CFI pseudo-instructions, it works
31906 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
31907 // nounwind. If we want to generalize this later, we may need to emit
31908 // CFI pseudo-instructions.
31909 assert(Entry->getParent()->getFunction()->hasFnAttribute(
31910 Attribute::NoUnwind) &&
31911 "Function should be nounwind in insertCopiesSplitCSR!");
31912 Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
31917 for (auto *Exit : Exits)
31918 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}