//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  // Bypass expensive divides on targets that report slow division (e.g. Atom)
  // when compiling at O2 or higher.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 16);
  }
  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Set up Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }
  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);
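
  // There is no native i1 extending load, so promote sign-extending loads
  // from i1 for every integer result type.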
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }
  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }
  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }

  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
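  // Expand BR_CC and SELECT_CC; X86 instead forms conditional branches and
  // selects from the custom-lowered BRCOND/SELECT/SETCC nodes configured here.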
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() ||
      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
  if (Subtarget.hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }
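
  // READCYCLECOUNTER is custom lowered so it can be expanded to RDTSC.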
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SETCCE, VT, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not intended to support SjLj exception
  // handling; they are a light-weight setjmp/longjmp replacement used for
  // continuation, user-level threading, and so on. No other SjLj exception
  // interfaces are implemented, so please don't build your own exception
  // handling on top of them. LLVM/Clang supports zero-cost DWARF exception
  // handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86).
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }
  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
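
  // TRAP and DEBUGTRAP select directly to single instructions (ud2 and int3).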
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod.
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      if (!TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FSIN, VT, Expand);
        setOperationAction(ISD::FCOS, VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types; we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }
    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
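
    // Extending vector loads from v2f32 are legal; they can be matched
    // directly in selection (e.g. by cvtps2pd for a v2f64 result).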
    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X.
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
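
    // v8i8/v8i16 UINT_TO_FP is custom lowered: after zero-extension to v8i32
    // the unsigned input fits in the signed range, so it can be converted as
    // a signed value.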
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }
    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X.
      setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);

      setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
    }
    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
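
    // Scalar i1 and the i1 mask vectors live in the AVX512 mask (k) registers.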
    addRegisterClass(MVT::i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
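
    // AVX512 provides sign-, zero- and any-extending vector loads into the
    // 512-bit integer types.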
    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }

    setOperationAction(ISD::BR_CC, MVT::i1, Expand);
    setOperationAction(ISD::SETCC, MVT::i1, Custom);
    setOperationAction(ISD::SETCCE, MVT::i1, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
    setOperationAction(ISD::XOR, MVT::i1, Legal);
    setOperationAction(ISD::OR, MVT::i1, Legal);
    setOperationAction(ISD::AND, MVT::i1, Legal);
    setOperationAction(ISD::SUB, MVT::i1, Custom);
    setOperationAction(ISD::ADD, MVT::i1, Custom);
    setOperationAction(ISD::MUL, MVT::i1, Custom);
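
    // Loads extending from, and stores truncating to, the i1 mask-vector
    // types are custom lowered.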
    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }
    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);

    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

      setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    } else {
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }
    setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
    setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
    if (Subtarget.hasDQI()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
      if (Subtarget.hasVLX()) {
        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
        setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
      }
    }
    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);

      // FIXME: These instructions are also available on SSE/AVX2; add the
      // relevant patterns.
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }
    setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    if (Subtarget.hasDQI()) {
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
    }
    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }
1321 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1322 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1324 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1325 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1326 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1328 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1329 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1330 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1331 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1332 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1334 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1335 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1337 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1339 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1340 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1341 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1342 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1343 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1344 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1345 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1346 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1347 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1348 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1349 setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
1350 setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
1352 setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
1353 setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
1354 setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
1355 setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
1356 setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
1357 setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
1358 setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
1359 setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
1361 setOperationAction(ISD::ADD, MVT::v8i1, Expand);
1362 setOperationAction(ISD::ADD, MVT::v16i1, Expand);
1363 setOperationAction(ISD::SUB, MVT::v8i1, Expand);
1364 setOperationAction(ISD::SUB, MVT::v16i1, Expand);
1365 setOperationAction(ISD::MUL, MVT::v8i1, Expand);
1366 setOperationAction(ISD::MUL, MVT::v16i1, Expand);
1368 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1370 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1371 setOperationAction(ISD::SRL, VT, Custom);
1372 setOperationAction(ISD::SHL, VT, Custom);
1373 setOperationAction(ISD::SRA, VT, Custom);
1374 setOperationAction(ISD::CTPOP, VT, Custom);
1375 setOperationAction(ISD::CTTZ, VT, Custom);
1378 // Need to promote to 64-bit even though we have 32-bit masked instructions
1379 // because the IR optimizers rearrange bitcasts around logic ops leaving
1380 // too many variations to handle if we don't promote them.
1381 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1382 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1383 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
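// A sketch of the intended effect (illustrative, not the selection code
// itself): a 32-bit-element logic op is rewritten through bitcasts to the
// 64-bit-element form, e.g.
//   (v16i32 (and X, Y))
//     -> (v16i32 (bitcast (and (v8i64 (bitcast X)), (v8i64 (bitcast Y)))))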
1385 if (Subtarget.hasCDI()) {
1386 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1387 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1389 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1390 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1391 setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
1392 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
1394 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
1395 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
1397 if (Subtarget.hasVLX()) {
1398 setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
1399 setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
1400 setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
1401 setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
1403 setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
1404 setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
1405 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1406 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1409 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
1410 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
1411 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
1412 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
1413 } // Subtarget.hasCDI()
1415 if (Subtarget.hasDQI()) {
1416 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1417 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1418 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1419 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
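// For example (a sketch): a v2i64 multiply on a DQI target without VLX is
// widened to the v8i64 form and the result is narrowed back, which is why the
// 128/256-bit cases can still be marked Legal here.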
1422 // Custom lower several nodes.
1423 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1424 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1425 setOperationAction(ISD::MGATHER, VT, Custom);
1426 setOperationAction(ISD::MSCATTER, VT, Custom);
1428 // Extract subvector is special because the value type
1429 // (result) is 256-bit but the source is 512-bit wide.
1430 // 128-bit was made Custom under AVX1.
1431 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1432 MVT::v8f32, MVT::v4f64 })
1433 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
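// For example (a sketch): extracting the upper v8f32 half of a v16f32 stays
// Custom here so it can be emitted as a plain 256-bit subvector extract
// instead of going through generic legalization.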
1434 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1435 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1436 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1438 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1439 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1440 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1441 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1442 setOperationAction(ISD::VSELECT, VT, Legal);
1443 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1444 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1445 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1446 setOperationAction(ISD::MLOAD, VT, Legal);
1447 setOperationAction(ISD::MSTORE, VT, Legal);
1448 setOperationAction(ISD::MGATHER, VT, Legal);
1449 setOperationAction(ISD::MSCATTER, VT, Custom);
1451 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1452 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1453 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1457 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1458 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1459 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1461 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1462 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1464 setOperationAction(ISD::ADD, MVT::v32i1, Expand);
1465 setOperationAction(ISD::ADD, MVT::v64i1, Expand);
1466 setOperationAction(ISD::SUB, MVT::v32i1, Expand);
1467 setOperationAction(ISD::SUB, MVT::v64i1, Expand);
1468 setOperationAction(ISD::MUL, MVT::v32i1, Expand);
1469 setOperationAction(ISD::MUL, MVT::v64i1, Expand);
1471 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1472 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1473 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1474 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1475 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1476 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1477 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1478 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1479 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1480 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1481 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1482 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1483 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
1484 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
1485 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1486 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1487 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1488 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1489 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1490 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1491 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1492 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1493 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1494 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1495 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1496 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1497 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1498 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1499 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1500 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1501 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1502 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1503 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1504 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1505 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1506 setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
1507 setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
1508 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1509 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1510 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1511 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1512 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1513 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1514 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1515 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1516 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1517 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1519 setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
1520 setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
1521 setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
1522 setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
1523 setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
1524 setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
1525 setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
1526 setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
1528 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1530 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1531 if (Subtarget.hasVLX()) {
1532 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1533 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1536 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1537 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1538 setOperationAction(ISD::MLOAD, VT, Action);
1539 setOperationAction(ISD::MSTORE, VT, Action);
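// Sketch of the intent: with VLX the 128/256-bit masked load/store maps
// directly onto the corresponding instruction, while without VLX the Custom
// lowering is expected to widen the operation to the 512-bit form.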
1542 if (Subtarget.hasCDI()) {
1543 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1544 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1547 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1548 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1549 setOperationAction(ISD::VSELECT, VT, Legal);
1550 setOperationAction(ISD::SRL, VT, Custom);
1551 setOperationAction(ISD::SHL, VT, Custom);
1552 setOperationAction(ISD::SRA, VT, Custom);
1553 setOperationAction(ISD::MLOAD, VT, Legal);
1554 setOperationAction(ISD::MSTORE, VT, Legal);
1555 setOperationAction(ISD::CTPOP, VT, Custom);
1556 setOperationAction(ISD::CTTZ, VT, Custom);
1558 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1559 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1560 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1563 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1564 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1565 if (Subtarget.hasVLX()) {
1566 // FIXME: These instructions are available on SSE/AVX2; add the relevant patterns.
1567 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1568 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
1573 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1574 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1575 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1577 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1578 setOperationAction(ISD::ADD, VT, Expand);
1579 setOperationAction(ISD::SUB, VT, Expand);
1580 setOperationAction(ISD::MUL, VT, Expand);
1581 setOperationAction(ISD::VSELECT, VT, Expand);
1583 setOperationAction(ISD::TRUNCATE, VT, Custom);
1584 setOperationAction(ISD::SETCC, VT, Custom);
1585 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1586 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1587 setOperationAction(ISD::SELECT, VT, Custom);
1588 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1589 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1592 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1593 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1594 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1595 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1597 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1598 setOperationAction(ISD::SMAX, VT, Legal);
1599 setOperationAction(ISD::UMAX, VT, Legal);
1600 setOperationAction(ISD::SMIN, VT, Legal);
1601 setOperationAction(ISD::UMIN, VT, Legal);
1605 // We want to custom lower some of our intrinsics.
1606 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1607 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1608 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1609 if (!Subtarget.is64Bit()) {
1610 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1611 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1614 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1615 // handle type legalization for these operations here.
1617 // FIXME: We really should do custom legalization for addition and
1618 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1619 // than generic legalization for 64-bit multiplication-with-overflow, though.
1620 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1621 if (VT == MVT::i64 && !Subtarget.is64Bit())
1622 continue;
1623 // Add/Sub/Mul with overflow operations are custom lowered.
1624 setOperationAction(ISD::SADDO, VT, Custom);
1625 setOperationAction(ISD::UADDO, VT, Custom);
1626 setOperationAction(ISD::SSUBO, VT, Custom);
1627 setOperationAction(ISD::USUBO, VT, Custom);
1628 setOperationAction(ISD::SMULO, VT, Custom);
1629 setOperationAction(ISD::UMULO, VT, Custom);
1632 if (!Subtarget.is64Bit()) {
1633 // These libcalls are not available in 32-bit.
1634 setLibcallName(RTLIB::SHL_I128, nullptr);
1635 setLibcallName(RTLIB::SRL_I128, nullptr);
1636 setLibcallName(RTLIB::SRA_I128, nullptr);
1639 // Combine sin / cos into one node or libcall if possible.
1640 if (Subtarget.hasSinCos()) {
1641 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1642 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1643 if (Subtarget.isTargetDarwin()) {
1644 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1645 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1646 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1647 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1651 if (Subtarget.isTargetWin64()) {
1652 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1653 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1654 setOperationAction(ISD::SREM, MVT::i128, Custom);
1655 setOperationAction(ISD::UREM, MVT::i128, Custom);
1656 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1657 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1660 // On 32-bit MSVC, `fmodf(f32)` is not defined; only `fmod(f64)` is. We
1661 // should promote the value to 64 bits to solve this.
1662 // This is what the CRT headers do - `fmodf` is an inline header
1663 // function casting to f64 and calling `fmod`.
1664 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1665 Subtarget.isTargetWindowsItanium()))
1666 for (ISD::NodeType Op :
1667 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1668 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1669 if (isOperationExpand(Op, MVT::f32))
1670 setOperationAction(Op, MVT::f32, Promote);
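// Illustrative effect of the Promote action above: a 32-bit FREM is legalized
// roughly as
//   (f32 (fp_round (frem (f64 (fp_extend A)), (f64 (fp_extend B)))))
// so the CRT's fmod(f64) can be called in place of the missing fmodf.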
1672 // We have target-specific dag combine patterns for the following nodes:
1673 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1674 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1675 setTargetDAGCombine(ISD::BITCAST);
1676 setTargetDAGCombine(ISD::VSELECT);
1677 setTargetDAGCombine(ISD::SELECT);
1678 setTargetDAGCombine(ISD::SHL);
1679 setTargetDAGCombine(ISD::SRA);
1680 setTargetDAGCombine(ISD::SRL);
1681 setTargetDAGCombine(ISD::OR);
1682 setTargetDAGCombine(ISD::AND);
1683 setTargetDAGCombine(ISD::ADD);
1684 setTargetDAGCombine(ISD::FADD);
1685 setTargetDAGCombine(ISD::FSUB);
1686 setTargetDAGCombine(ISD::FNEG);
1687 setTargetDAGCombine(ISD::FMA);
1688 setTargetDAGCombine(ISD::FMINNUM);
1689 setTargetDAGCombine(ISD::FMAXNUM);
1690 setTargetDAGCombine(ISD::SUB);
1691 setTargetDAGCombine(ISD::LOAD);
1692 setTargetDAGCombine(ISD::MLOAD);
1693 setTargetDAGCombine(ISD::STORE);
1694 setTargetDAGCombine(ISD::MSTORE);
1695 setTargetDAGCombine(ISD::TRUNCATE);
1696 setTargetDAGCombine(ISD::ZERO_EXTEND);
1697 setTargetDAGCombine(ISD::ANY_EXTEND);
1698 setTargetDAGCombine(ISD::SIGN_EXTEND);
1699 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1700 setTargetDAGCombine(ISD::SINT_TO_FP);
1701 setTargetDAGCombine(ISD::UINT_TO_FP);
1702 setTargetDAGCombine(ISD::SETCC);
1703 setTargetDAGCombine(ISD::MUL);
1704 setTargetDAGCombine(ISD::XOR);
1705 setTargetDAGCombine(ISD::MSCATTER);
1706 setTargetDAGCombine(ISD::MGATHER);
1708 computeRegisterProperties(Subtarget.getRegisterInfo());
1710 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1711 MaxStoresPerMemsetOptSize = 8;
1712 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1713 MaxStoresPerMemcpyOptSize = 4;
1714 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1715 MaxStoresPerMemmoveOptSize = 4;
1716 setPrefLoopAlignment(4); // 2^4 bytes.
1718 // An out-of-order CPU can speculatively execute past a predictable branch,
1719 // but a conditional move could be stalled by an expensive earlier operation.
1720 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1721 EnableExtLdPromotion = true;
1722 setPrefFunctionAlignment(4); // 2^4 bytes.
1724 verifyIntrinsicTables();
1727 // This has so far only been implemented for 64-bit MachO.
1728 bool X86TargetLowering::useLoadStackGuardNode() const {
1729 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1732 TargetLoweringBase::LegalizeTypeAction
1733 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1734 if (ExperimentalVectorWideningLegalization &&
1735 VT.getVectorNumElements() != 1 &&
1736 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1737 return TypeWidenVector;
1739 return TargetLoweringBase::getPreferredVectorAction(VT);
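// For example (a sketch): with the experimental widening flag enabled, a
// v2i32 value is widened to v4i32 here rather than taking the default
// legalization action for its type.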
1742 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1743 LLVMContext& Context,
1744 EVT VT) const {
1745 if (!VT.isVector())
1746 return Subtarget.hasAVX512() ? MVT::i1 : MVT::i8;
1748 if (VT.isSimple()) {
1749 MVT VVT = VT.getSimpleVT();
1750 const unsigned NumElts = VVT.getVectorNumElements();
1751 MVT EltVT = VVT.getVectorElementType();
1752 if (VVT.is512BitVector()) {
1753 if (Subtarget.hasAVX512())
1754 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1755 EltVT == MVT::f32 || EltVT == MVT::f64)
1756 switch (NumElts) {
1757 case 8: return MVT::v8i1;
1758 case 16: return MVT::v16i1;
1759 }
1760 if (Subtarget.hasBWI())
1761 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1762 switch (NumElts) {
1763 case 32: return MVT::v32i1;
1764 case 64: return MVT::v64i1;
1765 }
1768 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1769 return MVT::getVectorVT(MVT::i1, NumElts);
1771 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1772 EVT LegalVT = getTypeToTransformTo(Context, VT);
1773 EltVT = LegalVT.getVectorElementType().getSimpleVT();
1776 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1777 switch (NumElts) {
1778 case 2: return MVT::v2i1;
1779 case 4: return MVT::v4i1;
1780 case 8: return MVT::v8i1;
1781 }
1784 return VT.changeVectorElementTypeToInteger();
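// For example (a sketch): with AVX512 a setcc on v16i32 produces v16i1 via
// the 512-bit path above, while without AVX512 the fall-through returns
// v16i32.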
1787 /// Helper for getByValTypeAlignment to determine
1788 /// the desired ByVal argument alignment.
1789 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1792 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1793 if (VTy->getBitWidth() == 128)
1795 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1796 unsigned EltAlign = 0;
1797 getMaxByValAlign(ATy->getElementType(), EltAlign);
1798 if (EltAlign > MaxAlign)
1799 MaxAlign = EltAlign;
1800 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1801 for (auto *EltTy : STy->elements()) {
1802 unsigned EltAlign = 0;
1803 getMaxByValAlign(EltTy, EltAlign);
1804 if (EltAlign > MaxAlign)
1805 MaxAlign = EltAlign;
1812 /// Return the desired alignment for ByVal aggregate
1813 /// function arguments in the caller parameter area. For X86, aggregates
1814 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1815 /// are at 4-byte boundaries.
1816 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1817 const DataLayout &DL) const {
1818 if (Subtarget.is64Bit()) {
1819 // Max of 8 and alignment of type.
1820 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1821 if (TyAlign > 8)
1822 return TyAlign;
1823 return 8;
1824 }
1826 unsigned Align = 4;
1827 if (Subtarget.hasSSE1())
1828 getMaxByValAlign(Ty, Align);
1829 return Align;
1830 }
1832 /// Returns the target specific optimal type for load
1833 /// and store operations as a result of memset, memcpy, and memmove
1834 /// lowering. If DstAlign is zero, it is safe to assume that the destination
1835 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero there
1836 /// is no need to check it against the alignment requirement,
1837 /// probably because the source does not need to be loaded. If 'IsMemset' is
1838 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1839 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1840 /// source is constant so it does not need to be loaded.
1841 /// It returns EVT::Other if the type should be determined using generic
1842 /// target-independent logic.
1843 EVT
1844 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1845 unsigned DstAlign, unsigned SrcAlign,
1846 bool IsMemset, bool ZeroMemset,
1847 bool MemcpyStrSrc,
1848 MachineFunction &MF) const {
1849 const Function *F = MF.getFunction();
1850 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1851 if (Size >= 16 &&
1852 (!Subtarget.isUnalignedMem16Slow() ||
1853 ((DstAlign == 0 || DstAlign >= 16) &&
1854 (SrcAlign == 0 || SrcAlign >= 16)))) {
1855 // FIXME: Check if unaligned 32-byte accesses are slow.
1856 if (Size >= 32 && Subtarget.hasAVX()) {
1857 // Although this isn't a well-supported type for AVX1, we'll let
1858 // legalization and shuffle lowering produce the optimal codegen. If we
1859 // choose an optimal type with a vector element larger than a byte,
1860 // getMemsetStores() may create an intermediate splat (using an integer
1861 // multiply) before we splat as a vector.
1862 return MVT::v32i8;
1863 }
1864 if (Subtarget.hasSSE2())
1865 return MVT::v16i8;
1866 // TODO: Can SSE1 handle a byte vector?
1867 if (Subtarget.hasSSE1())
1868 return MVT::v4f32;
1869 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1870 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1871 // Do not use f64 to lower memcpy if source is string constant. It's
1872 // better to use i32 to avoid the loads.
1873 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1874 // The gymnastics of splatting a byte value into an XMM register and then
1875 // only using 8-byte stores (because this is a CPU with slow unaligned
1876 // 16-byte accesses) makes that a loser.
1877 return MVT::f64;
1878 }
1879 }
1880 // This is a compromise. If we reach here, unaligned accesses may be slow on
1881 // this target. However, creating smaller, aligned accesses could be even
1882 // slower and would certainly be a lot more code.
1883 if (Subtarget.is64Bit() && Size >= 8)
1884 return MVT::i64;
1885 return MVT::i32;
1886 }
1888 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1889 if (VT == MVT::f32)
1890 return X86ScalarSSEf32;
1891 else if (VT == MVT::f64)
1892 return X86ScalarSSEf64;
1893 return true;
1894 }
1896 bool
1897 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1898 unsigned,
1899 unsigned,
1900 bool *Fast) const {
1901 if (Fast) {
1902 switch (VT.getSizeInBits()) {
1903 default:
1904 // 8-byte and under are always assumed to be fast.
1905 *Fast = true;
1906 break;
1907 case 128:
1908 *Fast = !Subtarget.isUnalignedMem16Slow();
1909 break;
1910 case 256:
1911 *Fast = !Subtarget.isUnalignedMem32Slow();
1912 break;
1913 // TODO: What about AVX-512 (512-bit) accesses?
1914 }
1915 }
1916 // Misaligned accesses of any size are always allowed.
1917 return true;
1918 }
1920 /// Return the entry encoding for a jump table in the
1921 /// current function. The returned value is a member of the
1922 /// MachineJumpTableInfo::JTEntryKind enum.
1923 unsigned X86TargetLowering::getJumpTableEncoding() const {
1924 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1925 // symbol.
1926 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1927 return MachineJumpTableInfo::EK_Custom32;
1929 // Otherwise, use the normal jump table encoding heuristics.
1930 return TargetLowering::getJumpTableEncoding();
1933 bool X86TargetLowering::useSoftFloat() const {
1934 return Subtarget.useSoftFloat();
1938 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1939 const MachineBasicBlock *MBB,
1940 unsigned uid,MCContext &Ctx) const{
1941 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1942 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1943 // entries.
1944 return MCSymbolRefExpr::create(MBB->getSymbol(),
1945 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1948 /// Returns relocation base for the given PIC jumptable.
1949 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1950 SelectionDAG &DAG) const {
1951 if (!Subtarget.is64Bit())
1952 // This doesn't have SDLoc associated with it, but is not really the
1953 // same as a Register.
1954 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1955 getPointerTy(DAG.getDataLayout()));
1959 /// This returns the relocation base for the given PIC jumptable,
1960 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1961 const MCExpr *X86TargetLowering::
1962 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1963 MCContext &Ctx) const {
1964 // X86-64 uses RIP relative addressing based on the jump table label.
1965 if (Subtarget.isPICStyleRIPRel())
1966 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1968 // Otherwise, the reference is relative to the PIC base.
1969 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1972 std::pair<const TargetRegisterClass *, uint8_t>
1973 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1974 MVT VT) const {
1975 const TargetRegisterClass *RRC = nullptr;
1976 uint8_t Cost = 1;
1977 switch (VT.SimpleTy) {
1978 default:
1979 return TargetLowering::findRepresentativeClass(TRI, VT);
1980 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1981 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1982 break;
1983 case MVT::x86mmx:
1984 RRC = &X86::VR64RegClass;
1985 break;
1986 case MVT::f32: case MVT::f64:
1987 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1988 case MVT::v4f32: case MVT::v2f64:
1989 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1990 case MVT::v8f32: case MVT::v4f64:
1991 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1992 case MVT::v16f32: case MVT::v8f64:
1993 RRC = &X86::VR128XRegClass;
1994 break;
1995 }
1996 return std::make_pair(RRC, Cost);
1997 }
1999 unsigned X86TargetLowering::getAddressSpace() const {
2000 if (Subtarget.is64Bit())
2001 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2002 return 256;
2003 }
2005 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2006 // glibc has a special slot for the stack guard in tcbhead_t, use it instead
2007 // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
2008 if (!Subtarget.isTargetGlibc())
2009 return TargetLowering::getIRStackGuard(IRB);
2011 // The guard is at %fs:0x28 on x86-64 (%gs:0x28 with the Kernel code model)
2012 // and at %gs:0x14 on i386.
2013 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2014 unsigned AddressSpace = getAddressSpace();
2015 return ConstantExpr::getIntToPtr(
2016 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2017 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
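// Illustrative result (a sketch, assuming a 64-bit glibc target with the
// default code model): the guard is read through address space 257, roughly
//   inttoptr (i32 40 to i8* addrspace(257)*)   ; i.e. %fs:0x28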
2020 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2021 // MSVC CRT provides functionalities for stack protection.
2022 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2023 // MSVC CRT has a global variable holding security cookie.
2024 M.getOrInsertGlobal("__security_cookie",
2025 Type::getInt8PtrTy(M.getContext()));
2027 // MSVC CRT has a function to validate security cookie.
2028 auto *SecurityCheckCookie = cast<Function>(
2029 M.getOrInsertFunction("__security_check_cookie",
2030 Type::getVoidTy(M.getContext()),
2031 Type::getInt8PtrTy(M.getContext()), nullptr));
2032 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2033 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2036 // glibc has a special slot for the stack guard.
2037 if (Subtarget.isTargetGlibc())
2039 TargetLowering::insertSSPDeclarations(M);
2042 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2043 // MSVC CRT has a global variable holding security cookie.
2044 if (Subtarget.getTargetTriple().isOSMSVCRT())
2045 return M.getGlobalVariable("__security_cookie");
2046 return TargetLowering::getSDagStackGuard(M);
2049 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2050 // MSVC CRT has a function to validate security cookie.
2051 if (Subtarget.getTargetTriple().isOSMSVCRT())
2052 return M.getFunction("__security_check_cookie");
2053 return TargetLowering::getSSPStackGuardCheck(M);
2056 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2057 if (Subtarget.getTargetTriple().isOSContiki())
2058 return getDefaultSafeStackPointerLocation(IRB, false);
2060 if (!Subtarget.isTargetAndroid())
2061 return TargetLowering::getSafeStackPointerLocation(IRB);
2063 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2064 // definition of TLS_SLOT_SAFESTACK in
2065 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2066 unsigned AddressSpace, Offset;
2068 // The SafeStack pointer is at %fs:0x48 on x86-64 (%gs:0x48 with the Kernel
2069 // code model) and at %gs:0x24 on i386.
2070 Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2071 AddressSpace = getAddressSpace();
2072 return ConstantExpr::getIntToPtr(
2073 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2074 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2077 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2078 unsigned DestAS) const {
2079 assert(SrcAS != DestAS && "Expected different address spaces!");
2081 return SrcAS < 256 && DestAS < 256;
2084 //===----------------------------------------------------------------------===//
2085 // Return Value Calling Convention Implementation
2086 //===----------------------------------------------------------------------===//
2088 #include "X86GenCallingConv.inc"
2090 bool X86TargetLowering::CanLowerReturn(
2091 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2092 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2093 SmallVector<CCValAssign, 16> RVLocs;
2094 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2095 return CCInfo.CheckReturn(Outs, RetCC_X86);
2098 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2099 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2103 /// Lowers mask values (v*i1) to the local register values
2104 /// \returns DAG node after lowering to register type
2105 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2106 const SDLoc &Dl, SelectionDAG &DAG) {
2107 EVT ValVT = ValArg.getValueType();
2109 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2110 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2111 // Two stage lowering might be required
2112 // bitcast: v8i1 -> i8 / v16i1 -> i16
2113 // anyextend: i8 -> i32 / i16 -> i32
2114 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2115 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2116 if (ValLoc == MVT::i32)
2117 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2119 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2120 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2121 // One stage lowering is required
2122 // bitcast: v32i1 -> i32 / v64i1 -> i64
2123 return DAG.getBitcast(ValLoc, ValArg);
2125 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
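// For example (a sketch of the cases above): a v16i1 mask placed in an i32
// location takes both stages,
//   v16i1 -bitcast-> i16 -any_extend-> i32
// while a v32i1 mask in an i32 location needs only the single bitcast.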
2128 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2129 static void Passv64i1ArgInRegs(
2130 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2131 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2132 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2133 assert(Subtarget.hasBWI() &&
2134 "Expected AVX512BW target!");
2135 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2136 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2137 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2138 "The value should reside in two registers");
2140 // Before splitting the value we cast it to i64
2141 Arg = DAG.getBitcast(MVT::i64, Arg);
2143 // Split the value into two i32 halves.
2144 SDValue Lo, Hi;
2145 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2146 DAG.getConstant(0, Dl, MVT::i32));
2147 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2148 DAG.getConstant(1, Dl, MVT::i32));
2150 // Pass the two i32 halves in the corresponding registers.
2151 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2152 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
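// Sketch of the resulting pieces for a v64i1 argument on a 32-bit target:
// the value is viewed as an i64, EXTRACT_ELEMENT 0/1 produce the low and
// high i32 halves, and each half travels in its own GPR via RegsToPass.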
2153 }
2155 SDValue
2156 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2157 bool isVarArg,
2158 const SmallVectorImpl<ISD::OutputArg> &Outs,
2159 const SmallVectorImpl<SDValue> &OutVals,
2160 const SDLoc &dl, SelectionDAG &DAG) const {
2161 MachineFunction &MF = DAG.getMachineFunction();
2162 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2164 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2165 report_fatal_error("X86 interrupts may not return any value");
2167 SmallVector<CCValAssign, 16> RVLocs;
2168 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2169 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2171 SDValue Flag;
2172 SmallVector<SDValue, 6> RetOps;
2173 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2174 // Operand #1 = Bytes To Pop
2175 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2176 MVT::i32));
2178 // Copy the result values into the output registers.
2179 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2180 ++I, ++OutsIndex) {
2181 CCValAssign &VA = RVLocs[I];
2182 assert(VA.isRegLoc() && "Can only return in registers!");
2183 SDValue ValToCopy = OutVals[OutsIndex];
2184 EVT ValVT = ValToCopy.getValueType();
2186 // Promote values to the appropriate types.
2187 if (VA.getLocInfo() == CCValAssign::SExt)
2188 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2189 else if (VA.getLocInfo() == CCValAssign::ZExt)
2190 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2191 else if (VA.getLocInfo() == CCValAssign::AExt) {
2192 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2193 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2195 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2197 else if (VA.getLocInfo() == CCValAssign::BCvt)
2198 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2200 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2201 "Unexpected FP-extend for return value.");
2203 // If this is x86-64, and we disabled SSE, we can't return FP values,
2204 // or SSE or MMX vectors.
2205 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2206 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2207 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2208 report_fatal_error("SSE register return with SSE disabled");
2210 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2211 // llvm-gcc has never done it right and no one has noticed, so this
2212 // should be OK for now.
2213 if (ValVT == MVT::f64 &&
2214 (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2215 report_fatal_error("SSE2 register return with SSE2 disabled");
2217 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2218 // the RET instruction and handled by the FP Stackifier.
2219 if (VA.getLocReg() == X86::FP0 ||
2220 VA.getLocReg() == X86::FP1) {
2221 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2222 // change the value to the FP stack register class.
2223 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2224 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2225 RetOps.push_back(ValToCopy);
2226 // Don't emit a copytoreg.
2227 continue;
2228 }
2230 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2231 // which is returned in RAX / RDX.
2232 if (Subtarget.is64Bit()) {
2233 if (ValVT == MVT::x86mmx) {
2234 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2235 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2236 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2238 // If we don't have SSE2 available, convert to v4f32 so the generated
2239 // register is legal.
2240 if (!Subtarget.hasSSE2())
2241 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2246 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2248 if (VA.needsCustom()) {
2249 assert(VA.getValVT() == MVT::v64i1 &&
2250 "Currently the only custom case is when we split v64i1 to 2 regs");
2252 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2255 assert(2 == RegsToPass.size() &&
2256 "Expecting two registers after Pass64BitArgInRegs");
2258 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2261 // Add nodes to the DAG and add the values into the RetOps list
2262 for (auto &Reg : RegsToPass) {
2263 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2264 Flag = Chain.getValue(1);
2265 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2269 // Swift calling convention does not require we copy the sret argument
2270 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2272 // All x86 ABIs require that for returning structs by value we copy
2273 // the sret argument into %rax/%eax (depending on ABI) for the return.
2274 // We saved the argument into a virtual register in the entry block,
2275 // so now we copy the value out and into %rax/%eax.
2277 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2278 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2279 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2280 // either case FuncInfo->setSRetReturnReg() will have been called.
2281 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2282 // When we have both sret and another return value, we should use the
2283 // original Chain stored in RetOps[0], instead of the current Chain updated
2284 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2286 // For the case of sret and another return value, we have
2287 // Chain_0 at the function entry
2288 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2289 // If we use Chain_1 in getCopyFromReg, we will have
2290 // Val = getCopyFromReg(Chain_1)
2291 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2293 // getCopyToReg(Chain_0) will be glued together with
2294 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2295 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2296 // Data dependency from Unit B to Unit A due to usage of Val in
2297 // getCopyToReg(Chain_1, Val)
2298 // Chain dependency from Unit A to Unit B
2300 // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
2301 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2302 getPointerTy(MF.getDataLayout()));
2304 unsigned RetValReg
2305 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2306 X86::RAX : X86::EAX;
2307 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2308 Flag = Chain.getValue(1);
2310 // RAX/EAX now acts like a return value.
2311 RetOps.push_back(
2312 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2315 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2316 const MCPhysReg *I =
2317 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2318 if (I) {
2319 for (; *I; ++I) {
2320 if (X86::GR64RegClass.contains(*I))
2321 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2322 else
2323 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2324 }
2325 }
2327 RetOps[0] = Chain; // Update chain.
2329 // Add the flag if we have it.
2330 if (Flag.getNode())
2331 RetOps.push_back(Flag);
2333 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2334 if (CallConv == CallingConv::X86_INTR)
2335 opcode = X86ISD::IRET;
2336 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2339 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2340 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2341 return false;
2343 SDValue TCChain = Chain;
2344 SDNode *Copy = *N->use_begin();
2345 if (Copy->getOpcode() == ISD::CopyToReg) {
2346 // If the copy has a glue operand, we conservatively assume it isn't safe to
2347 // perform a tail call.
2348 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2349 return false;
2350 TCChain = Copy->getOperand(0);
2351 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2352 return false;
2354 bool HasRet = false;
2355 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2356 UI != UE; ++UI) {
2357 if (UI->getOpcode() != X86ISD::RET_FLAG)
2358 return false;
2359 // If we are returning more than one value, we can definitely
2360 // not make a tail call; see PR19530.
2361 if (UI->getNumOperands() > 4)
2362 return false;
2363 if (UI->getNumOperands() == 4 &&
2364 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2365 return false;
2366 HasRet = true;
2367 }
2369 if (!HasRet)
2370 return false;
2372 Chain = TCChain;
2373 return true;
2374 }
2376 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2377 ISD::NodeType ExtendKind) const {
2378 MVT ReturnMVT = MVT::i32;
2380 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2381 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2382 // The ABI does not require i1, i8 or i16 to be extended.
2384 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2385 // always extending i8/i16 return values, so keep doing that for now.
2387 ReturnMVT = MVT::i8;
2390 EVT MinVT = getRegisterType(Context, ReturnMVT);
2391 return VT.bitsLT(MinVT) ? MinVT : VT;
2394 /// Reads two 32 bit registers and creates a 64 bit mask value.
2395 /// \param VA The current 32 bit value that needs to be assigned.
2396 /// \param NextVA The next 32 bit value that needs to be assigned.
2397 /// \param Root The parent DAG node.
2398 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2399 /// glue purposes. If the DAG already uses a physical
2400 /// register instead of a virtual one, we glue our new
2401 /// SDValue to the InFlag SDValue.
2402 /// \returns a new 64 bit SDValue.
2403 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2404 SDValue &Root, SelectionDAG &DAG,
2405 const SDLoc &Dl, const X86Subtarget &Subtarget,
2406 SDValue *InFlag = nullptr) {
2407 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2408 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2409 assert(VA.getValVT() == MVT::v64i1 &&
2410 "Expecting first location of 64 bit width type");
2411 assert(NextVA.getValVT() == VA.getValVT() &&
2412 "The locations should have the same type");
2413 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2414 "The values should reside in two registers");
2418 SDValue ArgValueLo, ArgValueHi;
2420 MachineFunction &MF = DAG.getMachineFunction();
2421 const TargetRegisterClass *RC = &X86::GR32RegClass;
2423 // Read a 32 bit value from the registers
2424 if (nullptr == InFlag) {
2425 // When no physical register is present,
2426 // create an intermediate virtual register
2427 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2428 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2429 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2430 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2432 // When a physical register is available read the value from it and glue
2433 // the reads together.
2435 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2436 *InFlag = ArgValueLo.getValue(2);
2438 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2439 *InFlag = ArgValueHi.getValue(2);
2442 // Bitcast the low i32 half to v32i1.
2443 SDValue Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2445 // Bitcast the high i32 half to v32i1.
2446 SDValue Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2448 // Concatenate the two values together.
2449 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2452 /// The function will lower a register of various sizes (8/16/32/64)
2453 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2454 /// \returns a DAG node containing the operand after lowering to mask type.
2455 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2456 const EVT &ValLoc, const SDLoc &Dl,
2457 SelectionDAG &DAG) {
2458 SDValue ValReturned = ValArg;
2460 if (ValVT == MVT::v64i1) {
2461 // On a 32 bit machine this case is handled by getv64i1Argument.
2462 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2463 // On a 64 bit machine there is no need to truncate the value, only bitcast it.
2466 switch (ValVT.getSimpleVT().SimpleTy) {
2477 llvm_unreachable("Expecting a vector of i1 types");
2480 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2483 return DAG.getBitcast(ValVT, ValReturned);
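// For example (a sketch of the flow above): a v8i1 value arriving in an i32
// location is first truncated and then bitcast,
//   i32 -truncate-> i8 -bitcast-> v8i1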
2486 /// Lower the result values of a call into the
2487 /// appropriate copies out of appropriate physical registers.
2489 SDValue X86TargetLowering::LowerCallResult(
2490 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2491 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2492 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2494 // Assign locations to each value returned by this call.
2495 SmallVector<CCValAssign, 16> RVLocs;
2496 bool Is64Bit = Subtarget.is64Bit();
2497 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2499 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2501 // Copy all of the result registers out of their specified physreg.
2502 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2503 ++I, ++InsIndex) {
2504 CCValAssign &VA = RVLocs[I];
2505 EVT CopyVT = VA.getLocVT();
2507 // If this is x86-64, and we disabled SSE, we can't return FP values
2508 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2509 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2510 report_fatal_error("SSE register return with SSE disabled");
2513 // If we prefer to use the value in xmm registers, copy it out as f80 and
2514 // use a truncate to move it from fp stack reg to xmm reg.
2515 bool RoundAfterCopy = false;
2516 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2517 isScalarFPTypeInSSEReg(VA.getValVT())) {
2518 if (!Subtarget.hasX87())
2519 report_fatal_error("X87 register return with X87 disabled");
2520 CopyVT = MVT::f80;
2521 RoundAfterCopy = (CopyVT != VA.getLocVT());
2522 }
2524 SDValue Val;
2525 if (VA.needsCustom()) {
2526 assert(VA.getValVT() == MVT::v64i1 &&
2527 "Currently the only custom case is when we split v64i1 to 2 regs");
2528 Val =
2529 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2530 } else {
2531 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2532 .getValue(1);
2533 Val = Chain.getValue(0);
2534 InFlag = Chain.getValue(2);
2535 }
2537 if (RoundAfterCopy)
2538 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2539 // This truncation won't change the value.
2540 DAG.getIntPtrConstant(1, dl));
2542 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2543 if (VA.getValVT().isVector() &&
2544 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2545 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2546 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2547 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2548 } else
2549 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2552 InVals.push_back(Val);
2558 //===----------------------------------------------------------------------===//
2559 // C & StdCall & Fast Calling Convention implementation
2560 //===----------------------------------------------------------------------===//
2561 // The StdCall calling convention is standard for many Windows API routines.
2562 // It differs from the C calling convention only slightly: the callee, not
2563 // the caller, cleans up the stack, and symbols are decorated differently.
2564 // It does not support any vector arguments.
2565 // For info on fast calling convention see Fast Calling Convention (tail call)
2566 // implementation LowerX86_32FastCCCallTo.
2568 /// Determines whether a call uses struct return semantics.
2570 enum StructReturnType {
2571 NotStructReturn,
2572 RegStructReturn,
2573 StackStructReturn
2574 };
2575 static StructReturnType
2576 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2577 if (Outs.empty())
2578 return NotStructReturn;
2580 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2581 if (!Flags.isSRet())
2582 return NotStructReturn;
2583 if (Flags.isInReg() || IsMCU)
2584 return RegStructReturn;
2585 return StackStructReturn;
2588 /// Determines whether a function uses struct return semantics.
2589 static StructReturnType
2590 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2591 if (Ins.empty())
2592 return NotStructReturn;
2594 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2595 if (!Flags.isSRet())
2596 return NotStructReturn;
2597 if (Flags.isInReg() || IsMCU)
2598 return RegStructReturn;
2599 return StackStructReturn;
2602 /// Make a copy of an aggregate at address specified by "Src" to address
2603 /// "Dst" with size and alignment information specified by the specific
2604 /// parameter attribute. The copy will be passed as a byval function parameter.
2605 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2606 SDValue Chain, ISD::ArgFlagsTy Flags,
2607 SelectionDAG &DAG, const SDLoc &dl) {
2608 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2610 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2611 /*isVolatile*/false, /*AlwaysInline=*/true,
2612 /*isTailCall*/false,
2613 MachinePointerInfo(), MachinePointerInfo());
2616 /// Return true if the calling convention is one that we can guarantee TCO for.
2617 static bool canGuaranteeTCO(CallingConv::ID CC) {
2618 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2619 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2620 CC == CallingConv::HHVM);
2623 /// Return true if we might ever do TCO for calls with this calling convention.
2624 static bool mayTailCallThisCC(CallingConv::ID CC) {
2625 switch (CC) {
2626 // C calling conventions:
2627 case CallingConv::C:
2628 case CallingConv::X86_64_Win64:
2629 case CallingConv::X86_64_SysV:
2630 // Callee pop conventions:
2631 case CallingConv::X86_ThisCall:
2632 case CallingConv::X86_StdCall:
2633 case CallingConv::X86_VectorCall:
2634 case CallingConv::X86_FastCall:
2635 return true;
2636 default:
2637 return canGuaranteeTCO(CC);
2638 }
2639 }
2641 /// Return true if the function is being made into a tailcall target by
2642 /// changing its ABI.
2643 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2644 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2647 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2648 auto Attr =
2649 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2650 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2651 return false;
2653 ImmutableCallSite CS(CI);
2654 CallingConv::ID CalleeCC = CS.getCallingConv();
2655 if (!mayTailCallThisCC(CalleeCC))
2656 return false;
2658 return true;
2659 }
2662 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2663 const SmallVectorImpl<ISD::InputArg> &Ins,
2664 const SDLoc &dl, SelectionDAG &DAG,
2665 const CCValAssign &VA,
2666 MachineFrameInfo &MFI, unsigned i) const {
2667 // Create the nodes corresponding to a load from this parameter slot.
2668 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2669 bool AlwaysUseMutable = shouldGuaranteeTCO(
2670 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2671 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2674 // If value is passed by pointer we have address passed instead of the value
2675 // itself. No need to extend if the mask value and location share the same
2676 // bit width.
2677 bool ExtendedInMem =
2678 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2679 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2681 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2682 ValVT = VA.getLocVT();
2684 ValVT = VA.getValVT();
2686 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2687 // taken by a return address.
2689 if (CallConv == CallingConv::X86_INTR) {
2690 const X86Subtarget& Subtarget =
2691 static_cast<const X86Subtarget&>(DAG.getSubtarget());
2692 // X86 interrupts may take one or two arguments.
2693 // On the stack there will be no return address as in a regular call.
2694 // The offset of the last argument needs to be set to -4/-8 bytes.
2695 // The offset of the first of two arguments should be set to 0 bytes.
2696 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
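// Worked example (illustrative) for a 64-bit handler with two arguments:
// i = 0 gives 8 * ((0 + 1) % 2 - 1) = 0 for the first argument and
// i = 1 gives 8 * ((1 + 1) % 2 - 1) = -8 for the second.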
2699 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2700 // changed with more analysis.
2701 // In case of tail call optimization, mark all arguments mutable, since they
2702 // could be overwritten by the lowering of arguments in case of a tail call.
2703 if (Flags.isByVal()) {
2704 unsigned Bytes = Flags.getByValSize();
2705 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2706 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2707 // Adjust SP offset of interrupt parameter.
2708 if (CallConv == CallingConv::X86_INTR) {
2709 MFI.setObjectOffset(FI, Offset);
2711 return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2713 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
2714 VA.getLocMemOffset(), isImmutable);
2716 // Set SExt or ZExt flag.
2717 if (VA.getLocInfo() == CCValAssign::ZExt) {
2718 MFI.setObjectZExt(FI, true);
2719 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2720 MFI.setObjectSExt(FI, true);
2723 // Adjust SP offset of interrupt parameter.
2724 if (CallConv == CallingConv::X86_INTR) {
2725 MFI.setObjectOffset(FI, Offset);
2728 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2729 SDValue Val = DAG.getLoad(
2730 ValVT, dl, Chain, FIN,
2731 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2732 return ExtendedInMem ?
2733 DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2737 // FIXME: Get this from tablegen.
2738 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2739 const X86Subtarget &Subtarget) {
2740 assert(Subtarget.is64Bit());
2742 if (Subtarget.isCallingConvWin64(CallConv)) {
2743 static const MCPhysReg GPR64ArgRegsWin64[] = {
2744 X86::RCX, X86::RDX, X86::R8, X86::R9
2746 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2749 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2750 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2752 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2755 // FIXME: Get this from tablegen.
2756 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2757 CallingConv::ID CallConv,
2758 const X86Subtarget &Subtarget) {
2759 assert(Subtarget.is64Bit());
2760 if (Subtarget.isCallingConvWin64(CallConv)) {
2761 // The XMM registers which might contain var arg parameters are shadowed
2762 // in their paired GPR. So we only need to save the GPR to their home
2763 // registers.
2764 // TODO: __vectorcall will change this.
2768 const Function *Fn = MF.getFunction();
2769 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2770 bool isSoftFloat = Subtarget.useSoftFloat();
2771 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2772 "SSE register cannot be used when SSE is disabled!");
2773 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
    // registers.
    return None;
2778 static const MCPhysReg XMMArgRegs64Bit[] = {
2779 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2780 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2782 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2785 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2786 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2787 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2788 return A.getValNo() < B.getValNo();
2792 SDValue X86TargetLowering::LowerFormalArguments(
2793 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2794 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2795 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2796 MachineFunction &MF = DAG.getMachineFunction();
2797 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2798 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2800 const Function *Fn = MF.getFunction();
2801 if (Fn->hasExternalLinkage() &&
2802 Subtarget.isTargetCygMing() &&
2803 Fn->getName() == "main")
2804 FuncInfo->setForceFramePointer(true);
2806 MachineFrameInfo &MFI = MF.getFrameInfo();
2807 bool Is64Bit = Subtarget.is64Bit();
2808 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2811 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2812 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2814 if (CallConv == CallingConv::X86_INTR) {
2815 bool isLegal = Ins.size() == 1 ||
2816 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2817 (!Is64Bit && Ins[1].VT == MVT::i32)));
    if (!isLegal)
      report_fatal_error("X86 interrupts may take one or two arguments");
2822 // Assign locations to all of the incoming arguments.
2823 SmallVector<CCValAssign, 16> ArgLocs;
2824 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2826 // Allocate shadow area for Win64.
2828 CCInfo.AllocateStack(32, 8);
2830 CCInfo.AnalyzeArguments(Ins, CC_X86);
  // In vectorcall calling convention a second pass is required for the HVA
  // registers.
2834 if (CallingConv::X86_VectorCall == CallConv) {
2835 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
2840 if (!isSortedByValueNo(ArgLocs))
2841 llvm_unreachable("Argument Location list must be sorted before lowering");
2844 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2846 assert(InsIndex < Ins.size() && "Invalid Ins index");
2847 CCValAssign &VA = ArgLocs[I];
2849 if (VA.isRegLoc()) {
2850 EVT RegVT = VA.getLocVT();
2851 if (VA.needsCustom()) {
2853 VA.getValVT() == MVT::v64i1 &&
2854 "Currently the only custom case is when we split v64i1 to 2 regs");
        // In the regcall calling convention, v64i1 values that are
        // compiled for a 32 bit arch are split up into two registers.
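        // For example (illustrative): on i386 regcall, such an argument
        // arrives as two i32 halves in consecutive argument locations, and
        // getv64i1Argument below stitches them back into one v64i1 value.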
2859 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2861 const TargetRegisterClass *RC;
2862 if (RegVT == MVT::i32)
2863 RC = &X86::GR32RegClass;
2864 else if (Is64Bit && RegVT == MVT::i64)
2865 RC = &X86::GR64RegClass;
2866 else if (RegVT == MVT::f32)
2867 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2868 else if (RegVT == MVT::f64)
2869 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2870 else if (RegVT == MVT::f80)
2871 RC = &X86::RFP80RegClass;
2872 else if (RegVT == MVT::f128)
2873 RC = &X86::FR128RegClass;
2874 else if (RegVT.is512BitVector())
2875 RC = &X86::VR512RegClass;
2876 else if (RegVT.is256BitVector())
2877 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2878 else if (RegVT.is128BitVector())
2879 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2880 else if (RegVT == MVT::x86mmx)
2881 RC = &X86::VR64RegClass;
2882 else if (RegVT == MVT::i1)
2883 RC = &X86::VK1RegClass;
2884 else if (RegVT == MVT::v8i1)
2885 RC = &X86::VK8RegClass;
2886 else if (RegVT == MVT::v16i1)
2887 RC = &X86::VK16RegClass;
2888 else if (RegVT == MVT::v32i1)
2889 RC = &X86::VK32RegClass;
2890 else if (RegVT == MVT::v64i1)
2891 RC = &X86::VK64RegClass;
2893 llvm_unreachable("Unknown argument type!");
2895 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2896 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
2902 if (VA.getLocInfo() == CCValAssign::SExt)
2903 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2904 DAG.getValueType(VA.getValVT()));
2905 else if (VA.getLocInfo() == CCValAssign::ZExt)
2906 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2907 DAG.getValueType(VA.getValVT()));
2908 else if (VA.getLocInfo() == CCValAssign::BCvt)
2909 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2911 if (VA.isExtInLoc()) {
2912 // Handle MMX values passed in XMM regs.
2913 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2914 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2915 else if (VA.getValVT().isVector() &&
2916 VA.getValVT().getScalarType() == MVT::i1 &&
2917 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2918 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2919 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2920 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
2922 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2925 assert(VA.isMemLoc());
2927 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
2930 // If value is passed via pointer - do a load.
2931 if (VA.getLocInfo() == CCValAssign::Indirect)
2933 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
2935 InVals.push_back(ArgValue);
2938 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
    // Swift calling convention does not require us to copy the sret argument
    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
2941 if (CallConv == CallingConv::Swift)
2944 // All x86 ABIs require that for returning structs by value we copy the
2945 // sret argument into %rax/%eax (depending on ABI) for the return. Save
    // the argument into a virtual register so that we can access it from the
    // return points.
2948 if (Ins[I].Flags.isSRet()) {
2949 unsigned Reg = FuncInfo->getSRetReturnReg();
2951 MVT PtrTy = getPointerTy(DAG.getDataLayout());
2952 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2953 FuncInfo->setSRetReturnReg(Reg);
2955 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
2956 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2961 unsigned StackSize = CCInfo.getNextStackOffset();
2962 // Align stack specially for tail calls.
2963 if (shouldGuaranteeTCO(CallConv,
2964 MF.getTarget().Options.GuaranteedTailCallOpt))
2965 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of llvm.va_start.
  // We can skip this if there are no va_start calls.
2970 if (MFI.hasVAStart() &&
2971 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2972 CallConv != CallingConv::X86_ThisCall))) {
2973 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
2976 // Figure out if XMM registers are in use.
2977 assert(!(Subtarget.useSoftFloat() &&
2978 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2979 "SSE register cannot be used when SSE is disabled!");
2981 // 64-bit calling conventions support varargs and register parameters, so we
2982 // have to do extra work to spill them in the prologue.
2983 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
2984 // Find the first unallocated argument registers.
2985 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2986 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2987 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
2988 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
2989 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
2990 "SSE register cannot be used when SSE is disabled!");
2992 // Gather all the live in physical registers.
2993 SmallVector<SDValue, 6> LiveGPRs;
2994 SmallVector<SDValue, 8> LiveXMMRegs;
2996 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2997 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2999 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3001 if (!ArgXMMs.empty()) {
3002 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3003 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3004 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3005 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3006 LiveXMMRegs.push_back(
3007 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3012 // Get to the caller-allocated home save location. Add 8 to account
3013 // for the return address.
3014 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3015 FuncInfo->setRegSaveFrameIndex(
3016 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3017 // Fixup to set vararg frame on shadow area (4 x i64).
3019 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3021 // For X86-64, if there are vararg parameters that are passed via
3022 // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_arg.
3024 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3025 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3026 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3027 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
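      // For reference, a sketch of the SysV AMD64 va_list that the offsets
      // above feed (the struct layout is defined by the ABI, not this file):
      //   typedef struct {
      //     unsigned gp_offset;       // 0..48, bumped by 8 per GPR consumed
      //     unsigned fp_offset;       // 48..176, bumped by 16 per XMM consumed
      //     void *overflow_arg_area;  // stack args (VarArgsFrameIndex)
      //     void *reg_save_area;      // this 6*8 + 8*16 = 176 byte object
      //   } va_list[1];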
3030 // Store the integer parameter registers.
3031 SmallVector<SDValue, 8> MemOps;
3032 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3033 getPointerTy(DAG.getDataLayout()));
3034 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3035 for (SDValue Val : LiveGPRs) {
3036 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3037 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3039 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3040 MachinePointerInfo::getFixedStack(
3041 DAG.getMachineFunction(),
3042 FuncInfo->getRegSaveFrameIndex(), Offset));
3043 MemOps.push_back(Store);
3047 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3048 // Now store the XMM (fp + vector) parameter registers.
3049 SmallVector<SDValue, 12> SaveXMMOps;
3050 SaveXMMOps.push_back(Chain);
3051 SaveXMMOps.push_back(ALVal);
3052 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3053 FuncInfo->getRegSaveFrameIndex(), dl));
3054 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3055 FuncInfo->getVarArgsFPOffset(), dl));
3056 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3058 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3059 MVT::Other, SaveXMMOps));
3062 if (!MemOps.empty())
3063 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3066 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3067 // Find the largest legal vector type.
3068 MVT VecVT = MVT::Other;
3069 // FIXME: Only some x86_32 calling conventions support AVX512.
3070 if (Subtarget.hasAVX512() &&
3071 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3072 CallConv == CallingConv::Intel_OCL_BI)))
3073 VecVT = MVT::v16f32;
    else if (Subtarget.hasAVX())
      VecVT = MVT::v8f32;
    else if (Subtarget.hasSSE2())
      VecVT = MVT::v4f32;
3079 // We forward some GPRs and some vector types.
3080 SmallVector<MVT, 2> RegParmTypes;
3081 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3082 RegParmTypes.push_back(IntVT);
3083 if (VecVT != MVT::Other)
3084 RegParmTypes.push_back(VecVT);
3086 // Compute the set of forwarded registers. The rest are scratch.
3087 SmallVectorImpl<ForwardedRegister> &Forwards =
3088 FuncInfo->getForwardedMustTailRegParms();
3089 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3091 // Conservatively forward AL on x86_64, since it might be used for varargs.
3092 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3093 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3094 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3097 // Copy all forwards from physical to virtual registers.
3098 for (ForwardedRegister &F : Forwards) {
3099 // FIXME: Can we use a less constrained schedule?
3100 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3101 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3102 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3106 // Some CCs need callee pop.
3107 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3108 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3109 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3110 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3111 // X86 interrupts must pop the error code if present
3112 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
3114 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3115 // If this is an sret function, the return should pop the hidden pointer.
3116 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3117 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3118 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3119 FuncInfo->setBytesToPopOnReturn(4);
3123 // RegSaveFrameIndex is X86-64 only.
3124 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3125 if (CallConv == CallingConv::X86_FastCall ||
3126 CallConv == CallingConv::X86_ThisCall)
3127 // fastcc functions can't have varargs.
3128 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3131 FuncInfo->setArgumentStackSize(StackSize);
3133 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3134 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3135 if (Personality == EHPersonality::CoreCLR) {
3137 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3138 // that we'd prefer this slot be allocated towards the bottom of the frame
3139 // (i.e. near the stack pointer after allocating the frame). Every
3140 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3141 // offset from the bottom of this and each funclet's frame must be the
3142 // same, so the size of funclets' (mostly empty) frames is dictated by
3143 // how far this slot is from the bottom (since they allocate just enough
3144 // space to accommodate holding this slot at the correct offset).
3145 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3146 EHInfo->PSPSymFrameIdx = PSPSymFI;
3153 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3154 SDValue Arg, const SDLoc &dl,
3156 const CCValAssign &VA,
3157 ISD::ArgFlagsTy Flags) const {
3158 unsigned LocMemOffset = VA.getLocMemOffset();
3159 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3160 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3162 if (Flags.isByVal())
3163 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3165 return DAG.getStore(
3166 Chain, dl, Arg, PtrOff,
3167 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3170 /// Emit a load of return address if tail call
3171 /// optimization is performed and it is required.
3172 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3173 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3174 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3175 // Adjust the Return address stack slot.
3176 EVT VT = getPointerTy(DAG.getDataLayout());
3177 OutRetAddr = getReturnAddressFrameIndex(DAG);
3179 // Load the "old" Return address.
3180 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3181 return SDValue(OutRetAddr.getNode(), 1);
3184 /// Emit a store of the return address if tail call
3185 /// optimization is performed and it is required (FPDiff!=0).
3186 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3187 SDValue Chain, SDValue RetAddrFrIdx,
3188 EVT PtrVT, unsigned SlotSize,
3189 int FPDiff, const SDLoc &dl) {
3190 // Store the return address to the appropriate stack slot.
3191 if (!FPDiff) return Chain;
3192 // Calculate the new stack slot for the return address.
3193 int NewReturnAddrFI =
3194 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3196 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3197 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3198 MachinePointerInfo::getFixedStack(
3199 DAG.getMachineFunction(), NewReturnAddrFI));
/// Returns a vector_shuffle mask for a movs{s|d}, movd
/// operation of the specified width.
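/// For example (illustrative): for v4f32 this builds the mask <4, 1, 2, 3>,
/// i.e. element 0 is taken from V2 and the remaining elements from V1,
/// matching the movss "merge the low element" semantics.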
3205 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3207 unsigned NumElems = VT.getVectorNumElements();
3208 SmallVector<int, 8> Mask;
3209 Mask.push_back(NumElems);
3210 for (unsigned i = 1; i != NumElems; ++i)
3212 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3216 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3217 SmallVectorImpl<SDValue> &InVals) const {
3218 SelectionDAG &DAG = CLI.DAG;
3220 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3221 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3222 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3223 SDValue Chain = CLI.Chain;
3224 SDValue Callee = CLI.Callee;
3225 CallingConv::ID CallConv = CLI.CallConv;
3226 bool &isTailCall = CLI.IsTailCall;
3227 bool isVarArg = CLI.IsVarArg;
3229 MachineFunction &MF = DAG.getMachineFunction();
3230 bool Is64Bit = Subtarget.is64Bit();
3231 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3232 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3233 bool IsSibcall = false;
3234 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3235 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3237 if (CallConv == CallingConv::X86_INTR)
3238 report_fatal_error("X86 interrupts may not be called directly");
3240 if (Attr.getValueAsString() == "true")
3243 if (Subtarget.isPICStyleGOT() &&
3244 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3245 // If we are using a GOT, disable tail calls to external symbols with
3246 // default visibility. Tail calling such a symbol requires using a GOT
3247 // relocation, which forces early binding of the symbol. This breaks code
    // that requires lazy function symbol resolution. Using musttail or
3249 // GuaranteedTailCallOpt will override this.
3250 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3251 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3252 G->getGlobal()->hasDefaultVisibility()))
3256 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3258 // Force this to be a tail call. The verifier rules are enough to ensure
    // that we can lower this successfully without moving the return address
    // around.
    isTailCall = true;
3262 } else if (isTailCall) {
3263 // Check if it's really possible to do a tail call.
3264 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3265 isVarArg, SR != NotStructReturn,
3266 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3267 Outs, OutVals, Ins, DAG);
    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
3271 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3278 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3279 "Var args not supported with calling convention fastcc, ghc or hipe");
3281 // Analyze operands of the call, assigning locations to each operand.
3282 SmallVector<CCValAssign, 16> ArgLocs;
3283 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3285 // Allocate shadow area for Win64.
3287 CCInfo.AllocateStack(32, 8);
3289 CCInfo.AnalyzeArguments(Outs, CC_X86);
  // In vectorcall calling convention a second pass is required for the HVA
  // registers.
3293 if (CallingConv::X86_VectorCall == CallConv) {
3294 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3297 // Get a count of how many bytes are to be pushed on the stack.
3298 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
  if (IsSibcall)
    // This is a sibcall. The memory operands are already available where the
    // caller's own incoming arguments live, i.e. in the caller's caller's
    // stack.
    NumBytes = 0;
3303 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3304 canGuaranteeTCO(CallConv))
3305 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3308 if (isTailCall && !IsSibcall && !IsMustTail) {
3309 // Lower arguments at fp - stackoffset + fpdiff.
3310 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3312 FPDiff = NumBytesCallerPushed - NumBytes;
3314 // Set the delta of movement of the returnaddr stackslot.
3315 // But only set if delta is greater than previous delta.
3316 if (FPDiff < X86Info->getTCReturnAddrDelta())
3317 X86Info->setTCReturnAddrDelta(FPDiff);
3320 unsigned NumBytesToPush = NumBytes;
3321 unsigned NumBytesToPop = NumBytes;
  // If we have an inalloca argument, all stack space has already been
  // allocated for us and is right at the top of the stack. We don't support
  // multiple arguments passed in memory when using inalloca.
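  // For example (illustrative IR): the caller materializes the argument
  // memory itself and the call consumes it, so nothing is stored here:
  //   %mem = alloca inalloca <{ %struct.S }>
  //   ...initialize %mem...
  //   call void @f(<{ %struct.S }>* inalloca %mem)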
3326 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3328 if (!ArgLocs.back().isMemLoc())
      report_fatal_error("cannot use inalloca attribute on a register "
                         "parameter");
3331 if (ArgLocs.back().getLocMemOffset() != 0)
3332 report_fatal_error("any parameter with the inalloca attribute must be "
3333 "the only memory argument");
3337 Chain = DAG.getCALLSEQ_START(
3338 Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3340 SDValue RetAddrFrIdx;
3341 // Load return address for tail calls.
3342 if (isTailCall && FPDiff)
3343 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3344 Is64Bit, FPDiff, dl);
3346 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3347 SmallVector<SDValue, 8> MemOpChains;
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
3352 if (!isSortedByValueNo(ArgLocs))
3353 llvm_unreachable("Argument Location list must be sorted before lowering");
  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
3357 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3358 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3360 assert(OutIndex < Outs.size() && "Invalid Out index");
3361 // Skip inalloca arguments, they have already been written.
3362 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3363 if (Flags.isInAlloca())
3366 CCValAssign &VA = ArgLocs[I];
3367 EVT RegVT = VA.getLocVT();
3368 SDValue Arg = OutVals[OutIndex];
3369 bool isByVal = Flags.isByVal();
3371 // Promote the value if needed.
3372 switch (VA.getLocInfo()) {
3373 default: llvm_unreachable("Unknown loc info!");
3374 case CCValAssign::Full: break;
3375 case CCValAssign::SExt:
3376 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3378 case CCValAssign::ZExt:
3379 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3381 case CCValAssign::AExt:
3382 if (Arg.getValueType().isVector() &&
3383 Arg.getValueType().getVectorElementType() == MVT::i1)
3384 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3385 else if (RegVT.is128BitVector()) {
3386 // Special case: passing MMX values in XMM registers.
3387 Arg = DAG.getBitcast(MVT::i64, Arg);
3388 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3389 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3391 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3393 case CCValAssign::BCvt:
3394 Arg = DAG.getBitcast(RegVT, Arg);
3396 case CCValAssign::Indirect: {
3397 // Store the argument.
3398 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3399 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3400 Chain = DAG.getStore(
3401 Chain, dl, Arg, SpillSlot,
3402 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3408 if (VA.needsCustom()) {
3409 assert(VA.getValVT() == MVT::v64i1 &&
3410 "Currently the only custom case is when we split v64i1 to 2 regs");
3411 // Split v64i1 value into two registers
3412 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3414 } else if (VA.isRegLoc()) {
3415 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3416 if (isVarArg && IsWin64) {
      // The Win64 ABI requires an argument XMM reg to be copied to the
      // corresponding shadow GPR if the callee is a varargs function.
3419 unsigned ShadowReg = 0;
3420 switch (VA.getLocReg()) {
3421 case X86::XMM0: ShadowReg = X86::RCX; break;
3422 case X86::XMM1: ShadowReg = X86::RDX; break;
3423 case X86::XMM2: ShadowReg = X86::R8; break;
3424 case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3429 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3430 assert(VA.isMemLoc());
3431 if (!StackPtr.getNode())
3432 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3433 getPointerTy(DAG.getDataLayout()));
3434 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3435 dl, DAG, VA, Flags));
3439 if (!MemOpChains.empty())
3440 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3442 if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires GOT in the EBX register before function calls via PLT
    // GOT pointer.
3446 RegsToPass.push_back(std::make_pair(
3447 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3448 getPointerTy(DAG.getDataLayout()))));
    // If we are tail calling and generating PIC/GOT style code, load the
    // address of the callee into ECX. The value in ECX is used as the target
    // of the tail jump. This is done to circumvent the ebx/callee-saved
    // problem for tail calls on PIC/GOT architectures. Normally we would just
    // put the address of GOT into ebx and then call target@PLT. But for tail
    // calls ebx would be restored (since ebx is callee saved) before jumping
    // to the callee.
    //
    // Note: The actual moving to ECX is done further down.
3459 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3460 if (G && !G->getGlobal()->hasLocalLinkage() &&
3461 G->getGlobal()->hasDefaultVisibility())
3462 Callee = LowerGlobalAddress(Callee, DAG);
3463 else if (isa<ExternalSymbolSDNode>(Callee))
3464 Callee = LowerExternalSymbol(Callee, DAG);
3468 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3469 // From AMD64 ABI document:
3470 // For calls that may call functions that use varargs or stdargs
3471 // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as a hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and must be in the range 0 - 8 inclusive.
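    // For example (illustrative): a call like printf("%f %f\n", x, y) passes
    // the two doubles in XMM0 and XMM1, so any %al value in [2, 8] would be a
    // valid upper bound; we pass the exact count of XMM registers allocated.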
3477 // Count the number of XMM registers allocated.
3478 static const MCPhysReg XMMArgRegs[] = {
3479 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3480 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3482 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3483 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3484 && "SSE registers cannot be used when SSE is disabled");
3486 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3487 DAG.getConstant(NumXMMRegs, dl,
3491 if (isVarArg && IsMustTail) {
3492 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3493 for (const auto &F : Forwards) {
3494 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3495 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3499 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3500 // don't need this because the eligibility check rejects calls that require
3501 // shuffling arguments passed in memory.
3502 if (!IsSibcall && isTailCall) {
3503 // Force all the incoming stack arguments to be loaded from the stack
3504 // before any new outgoing arguments are stored to the stack, because the
3505 // outgoing stack slots may alias the incoming argument stack slots, and
3506 // the alias isn't otherwise explicit. This is slightly more conservative
3507 // than necessary, because it means that each store effectively depends
3508 // on every argument instead of just those arguments it would clobber.
3509 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI;
3514 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3516 CCValAssign &VA = ArgLocs[I];
3518 if (VA.isRegLoc()) {
3519 if (VA.needsCustom()) {
3520 assert((CallConv == CallingConv::X86_RegCall) &&
3521 "Expecting custome case only in regcall calling convention");
3522 // This means that we are in special case where one argument was
3523 // passed through two register locations - Skip the next location
3530 assert(VA.isMemLoc());
3531 SDValue Arg = OutVals[OutsIndex];
3532 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3533 // Skip inalloca arguments. They don't require any work.
3534 if (Flags.isInAlloca())
3536 // Create frame index.
3537 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3538 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3539 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3540 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3542 if (Flags.isByVal()) {
3543 // Copy relative to framepointer.
3544 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3545 if (!StackPtr.getNode())
3546 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3547 getPointerTy(DAG.getDataLayout()));
3548 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3551 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3555 // Store relative to framepointer.
3556 MemOpChains2.push_back(DAG.getStore(
3557 ArgChain, dl, Arg, FIN,
3558 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3562 if (!MemOpChains2.empty())
3563 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3565 // Store the return address to the appropriate stack slot.
3566 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3567 getPointerTy(DAG.getDataLayout()),
3568 RegInfo->getSlotSize(), FPDiff, dl);
3571 // Build a sequence of copy-to-reg nodes chained together with token chain
3572 // and flag operands which copy the outgoing args into registers.
3574 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3575 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3576 RegsToPass[i].second, InFlag);
3577 InFlag = Chain.getValue(1);
3580 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3581 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3582 // In the 64-bit large code model, we have to make all calls
3583 // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
3586 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't
    // hack it.
3590 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
    // We should use an extra load for direct calls to dllimported functions
    // in non-JIT mode.
3594 const GlobalValue *GV = G->getGlobal();
3595 if (!GV->hasDLLImportStorageClass()) {
3596 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3598 Callee = DAG.getTargetGlobalAddress(
3599 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3601 if (OpFlags == X86II::MO_GOTPCREL) {
3603 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3604 getPointerTy(DAG.getDataLayout()), Callee);
3605 // Add extra indirection
3606 Callee = DAG.getLoad(
3607 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3608 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3611 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3612 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3613 unsigned char OpFlags =
3614 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3616 Callee = DAG.getTargetExternalSymbol(
3617 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3618 } else if (Subtarget.isTarget64BitILP32() &&
3619 Callee->getValueType(0) == MVT::i32) {
    // Zero-extend the 32-bit Callee address into a 64-bit one according to the
    // x32 ABI.
3621 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3624 // Returns a chain & a flag for retval copy to use.
3625 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3626 SmallVector<SDValue, 8> Ops;
3628 if (!IsSibcall && isTailCall) {
3629 Chain = DAG.getCALLSEQ_END(Chain,
3630 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3631 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3632 InFlag = Chain.getValue(1);
3635 Ops.push_back(Chain);
3636 Ops.push_back(Callee);
3639 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
  // Add argument registers to the end of the list so that they are known live
  // into the call.
3643 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3644 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3645 RegsToPass[i].second.getValueType()));
3647 // Add a register mask operand representing the call-preserved registers.
3648 const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
3649 assert(Mask && "Missing call preserved mask for calling convention");
3651 // If this is an invoke in a 32-bit function using a funclet-based
3652 // personality, assume the function clobbers all registers. If an exception
3653 // is thrown, the runtime will not restore CSRs.
3654 // FIXME: Model this more precisely so that we can register allocate across
3655 // the normal edge and spill and fill across the exceptional edge.
3656 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3657 const Function *CallerFn = MF.getFunction();
3658 EHPersonality Pers =
3659 CallerFn->hasPersonalityFn()
3660 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3661 : EHPersonality::Unknown;
3662 if (isFuncletEHPersonality(Pers))
3663 Mask = RegInfo->getNoPreservedMask();
3666 Ops.push_back(DAG.getRegisterMask(Mask));
3668 if (InFlag.getNode())
3669 Ops.push_back(InFlag);
    // We used to do:
    //// If this is the first return lowered for this function, add the regs
3674 //// to the liveout set for the function.
3675 // This isn't right, although it's probably harmless on x86; liveouts
3676 // should be computed from returns not tail calls. Consider a void
3677 // function making a tail call to a function returning int.
3678 MF.getFrameInfo().setHasTailCall();
3679 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3682 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3683 InFlag = Chain.getValue(1);
3685 // Create the CALLSEQ_END node.
3686 unsigned NumBytesForCalleeToPop;
3687 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3688 DAG.getTarget().Options.GuaranteedTailCallOpt))
3689 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3690 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3691 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3692 SR == StackStructReturn)
3693 // If this is a call to a struct-return function, the callee
3694 // pops the hidden struct pointer, so we have to push it back.
3695 // This is common for Darwin/X86, Linux & Mingw32 targets.
3696 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3697 NumBytesForCalleeToPop = 4;
3699 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3701 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
    // No need to reset the stack after the call if the call doesn't return. To
    // make the MI verifier happy, we'll pretend the callee does it for us.
3704 NumBytesForCalleeToPop = NumBytes;
3707 // Returns a flag for retval copy to use.
3709 Chain = DAG.getCALLSEQ_END(Chain,
3710 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3711 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3714 InFlag = Chain.getValue(1);
  // Handle result values, copying them out of physregs into vregs that we
  // return.
3719 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3720 Ins, dl, DAG, InVals);
3723 //===----------------------------------------------------------------------===//
3724 // Fast Calling Convention (tail call) implementation
3725 //===----------------------------------------------------------------------===//
//  Like stdcall, the callee cleans up the arguments, except that ECX is
//  reserved for storing the address of the tail-called function. Only 2
//  registers are free for argument passing (inreg). Tail call optimization
//  is performed provided:
//    * tailcallopt is enabled
//    * caller/callee are fastcc
3733 // On X86_64 architecture with GOT-style position independent code only local
3734 // (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's
//  dyld for example)
//  If a tail-called callee has more arguments than the caller, the caller
//  needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after
//  the original RETADDR, but before the saved framepointer or the spilled
//  registers, e.g. when caller(arg1, arg2) calls callee(arg1, arg2, arg3,
//  arg4).
/// Make the stack size aligned, e.g. 16n + 12 for a 16-byte alignment
/// requirement.
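/// A worked example (illustrative, 32-bit: StackAlignment = 16, SlotSize = 4):
/// StackSize 20 has (20 & 15) = 4 <= 12, so it becomes 20 + (12 - 4) = 28,
/// i.e. 16 + 12; StackSize 30 has (30 & 15) = 14 > 12, so it becomes
/// (30 & ~15) + 16 + 12 = 44, i.e. 2*16 + 12.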
3757 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3758 SelectionDAG& DAG) const {
3759 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3760 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3761 unsigned StackAlignment = TFI.getStackAlignment();
3762 uint64_t AlignMask = StackAlignment - 1;
3763 int64_t Offset = StackSize;
3764 unsigned SlotSize = RegInfo->getSlotSize();
3765 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3766 // Number smaller than 12 so just add the difference.
3767 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3769 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3770 Offset = ((~AlignMask) & Offset) + StackAlignment +
3771 (StackAlignment-SlotSize);
3776 /// Return true if the given stack call argument is already available in the
3777 /// same position (relatively) of the caller's incoming argument stack.
3779 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3780 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3781 const X86InstrInfo *TII, const CCValAssign &VA) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;
  int FI = INT_MAX;
3785 // Look through nodes that don't alter the bits of the incoming value.
3786 unsigned Op = Arg.getOpcode();
3787 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3788 Arg = Arg.getOperand(0);
3791 if (Op == ISD::TRUNCATE) {
3792 const SDValue &TruncInput = Arg.getOperand(0);
3793 if (TruncInput.getOpcode() == ISD::AssertZext &&
3794 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3795 Arg.getValueType()) {
3796 Arg = TruncInput.getOperand(0);
3804 if (Arg.getOpcode() == ISD::CopyFromReg) {
3805 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3806 if (!TargetRegisterInfo::isVirtualRegister(VR))
3808 MachineInstr *Def = MRI->getVRegDef(VR);
3811 if (!Flags.isByVal()) {
3812 if (!TII->isLoadFromStackSlot(*Def, FI))
3815 unsigned Opcode = Def->getOpcode();
3816 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3817 Opcode == X86::LEA64_32r) &&
3818 Def->getOperand(1).isFI()) {
3819 FI = Def->getOperand(1).getIndex();
3820 Bytes = Flags.getByValSize();
3824 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3825 if (Flags.isByVal())
3826 // ByVal argument is passed in as a pointer but it's now being
3827 // dereferenced. e.g.
3828 // define @foo(%struct.X* %A) {
3829 // tail call @bar(%struct.X* byval %A)
3832 SDValue Ptr = Ld->getBasePtr();
3833 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3836 FI = FINode->getIndex();
3837 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3838 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3839 FI = FINode->getIndex();
3840 Bytes = Flags.getByValSize();
3844 assert(FI != INT_MAX);
3845 if (!MFI.isFixedObjectIndex(FI))
3848 if (Offset != MFI.getObjectOffset(FI))
3851 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3852 // If the argument location is wider than the argument type, check that any
3853 // extension flags match.
3854 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
3855 Flags.isSExt() != MFI.isObjectSExt(FI)) {
3860 return Bytes == MFI.getObjectSize(FI);
3863 /// Check whether the call is eligible for tail call optimization. Targets
3864 /// that want to do tail call optimization should implement this function.
3865 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3866 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3867 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3868 const SmallVectorImpl<ISD::OutputArg> &Outs,
3869 const SmallVectorImpl<SDValue> &OutVals,
3870 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3871 if (!mayTailCallThisCC(CalleeCC))
3874 // If -tailcallopt is specified, make fastcc functions tail-callable.
3875 MachineFunction &MF = DAG.getMachineFunction();
3876 const Function *CallerF = MF.getFunction();
3878 // If the function return type is x86_fp80 and the callee return type is not,
3879 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3880 // perform a tailcall optimization here.
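  // For instance (an illustrative case): a caller "long double f()" that
  // tail-calls a "double g()" would need an FP_EXTEND of the result after the
  // call, so the call could not be the final instruction.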
3881 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3884 CallingConv::ID CallerCC = CallerF->getCallingConv();
3885 bool CCMatch = CallerCC == CalleeCC;
3886 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
3887 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
3889 // Win64 functions have extra shadow space for argument homing. Don't do the
  // sibcall if the caller and callee have mismatched expectations for this
  // space.
3892 if (IsCalleeWin64 != IsCallerWin64)
3895 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3896 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3901 // Look for obvious safe cases to perform tail call optimization that do not
3902 // require ABI changes. This is what gcc calls sibcall.
3904 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3905 // emit a special epilogue.
3906 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3907 if (RegInfo->needsStackRealignment(MF))
3910 // Also avoid sibcall optimization if either caller or callee uses struct
3911 // return semantics.
3912 if (isCalleeStructRet || isCallerStructRet)
  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
3917 LLVMContext &C = *DAG.getContext();
3918 if (isVarArg && !Outs.empty()) {
3919 // Optimizing for varargs on Win64 is unlikely to be safe without
3920 // additional testing.
3921 if (IsCalleeWin64 || IsCallerWin64)
3924 SmallVector<CCValAssign, 16> ArgLocs;
3925 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3927 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3928 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3929 if (!ArgLocs[i].isRegLoc())
3933 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3934 // stack. Therefore, if it's not used by the call it is not safe to optimize
3935 // this into a sibcall.
3936 bool Unused = false;
3937 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3944 SmallVector<CCValAssign, 16> RVLocs;
3945 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
3946 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3947 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3948 CCValAssign &VA = RVLocs[i];
3949 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3954 // Check that the call results are passed in the same way.
3955 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3956 RetCC_X86, RetCC_X86))
3958 // The callee has to preserve all registers the caller needs to preserve.
3959 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3960 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3962 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3963 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3967 unsigned StackArgsSize = 0;
  // If the callee takes no arguments then go on to check the results of the
  // call.
3971 if (!Outs.empty()) {
3972 // Check if stack adjustment is needed. For now, do not do this if any
3973 // argument is passed on the stack.
3974 SmallVector<CCValAssign, 16> ArgLocs;
3975 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3977 // Allocate shadow area for Win64
3979 CCInfo.AllocateStack(32, 8);
3981 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3982 StackArgsSize = CCInfo.getNextStackOffset();
3984 if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the same way as the
      // caller's fixed stack objects.
3987 MachineFrameInfo &MFI = MF.getFrameInfo();
3988 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3989 const X86InstrInfo *TII = Subtarget.getInstrInfo();
3990 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3991 CCValAssign &VA = ArgLocs[i];
3992 SDValue Arg = OutVals[i];
3993 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3994 if (VA.getLocInfo() == CCValAssign::Indirect)
3996 if (!VA.isRegLoc()) {
3997 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4004 bool PositionIndependent = isPositionIndependent();
4005 // If the tailcall address may be in a register, then make sure it's
4006 // possible to register allocate for it. In 32-bit, the call address can
4007 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4008 // callee-saved registers are restored. These happen to be the same
4009 // registers used to pass 'inreg' arguments so watch out for those.
4010 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4011 !isa<ExternalSymbolSDNode>(Callee)) ||
4012 PositionIndependent)) {
4013 unsigned NumInRegs = 0;
    // In PIC we need an extra register to formulate the address computation
    // for the callee.
4016 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4018 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4019 CCValAssign &VA = ArgLocs[i];
4022 unsigned Reg = VA.getLocReg();
4025 case X86::EAX: case X86::EDX: case X86::ECX:
4026 if (++NumInRegs == MaxInRegs)
4033 const MachineRegisterInfo &MRI = MF.getRegInfo();
4034 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4038 bool CalleeWillPop =
4039 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4040 MF.getTarget().Options.GuaranteedTailCallOpt);
4042 if (unsigned BytesToPop =
4043 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4044 // If we have bytes to pop, the callee must pop them.
4045 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4046 if (!CalleePopMatches)
4048 } else if (CalleeWillPop && StackArgsSize > 0) {
4049 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4057 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4058 const TargetLibraryInfo *libInfo) const {
4059 return X86::createFastISel(funcInfo, libInfo);
4062 //===----------------------------------------------------------------------===//
4063 // Other Lowering Hooks
4064 //===----------------------------------------------------------------------===//
4066 static bool MayFoldLoad(SDValue Op) {
4067 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4070 static bool MayFoldIntoStore(SDValue Op) {
4071 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4074 static bool MayFoldIntoZeroExtend(SDValue Op) {
4075 if (Op.hasOneUse()) {
4076 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4077 return (ISD::ZERO_EXTEND == Opcode);
4082 static bool isTargetShuffle(unsigned Opcode) {
4084 default: return false;
4085 case X86ISD::BLENDI:
4086 case X86ISD::PSHUFB:
4087 case X86ISD::PSHUFD:
4088 case X86ISD::PSHUFHW:
4089 case X86ISD::PSHUFLW:
4091 case X86ISD::INSERTPS:
4092 case X86ISD::PALIGNR:
4093 case X86ISD::VSHLDQ:
4094 case X86ISD::VSRLDQ:
4095 case X86ISD::MOVLHPS:
4096 case X86ISD::MOVLHPD:
4097 case X86ISD::MOVHLPS:
4098 case X86ISD::MOVLPS:
4099 case X86ISD::MOVLPD:
4100 case X86ISD::MOVSHDUP:
4101 case X86ISD::MOVSLDUP:
4102 case X86ISD::MOVDDUP:
4105 case X86ISD::UNPCKL:
4106 case X86ISD::UNPCKH:
4107 case X86ISD::VBROADCAST:
4108 case X86ISD::VPERMILPI:
4109 case X86ISD::VPERMILPV:
4110 case X86ISD::VPERM2X128:
4111 case X86ISD::VPERMIL2:
4112 case X86ISD::VPERMI:
4113 case X86ISD::VPPERM:
4114 case X86ISD::VPERMV:
4115 case X86ISD::VPERMV3:
4116 case X86ISD::VPERMIV3:
4117 case X86ISD::VZEXT_MOVL:
4122 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4124 default: return false;
4126 case X86ISD::PSHUFB:
4127 case X86ISD::VPERMILPV:
4128 case X86ISD::VPERMIL2:
4129 case X86ISD::VPPERM:
4130 case X86ISD::VPERMV:
4131 case X86ISD::VPERMV3:
4132 case X86ISD::VPERMIV3:
4134 // 'Faux' Target Shuffles.
4140 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4141 MachineFunction &MF = DAG.getMachineFunction();
4142 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4143 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4144 int ReturnAddrIndex = FuncInfo->getRAIndex();
4146 if (ReturnAddrIndex == 0) {
4147 // Set up a frame object for the return address.
4148 unsigned SlotSize = RegInfo->getSlotSize();
4149 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4152 FuncInfo->setRAIndex(ReturnAddrIndex);
4155 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4158 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4159 bool hasSymbolicDisplacement) {
  // The offset should fit into a 32-bit immediate field.
4161 if (!isInt<32>(Offset))
  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
4166 if (!hasSymbolicDisplacement)
4169 // FIXME: Some tweaks might be needed for medium code model.
4170 if (M != CodeModel::Small && M != CodeModel::Kernel)
  // For the small code model, we assume that the latest object is 16MB before
  // the end of the 31-bit boundary. We may also accept pretty large negative
  // constants, knowing that all objects are in the positive half of the
  // address space.
4176 if (M == CodeModel::Small && Offset < 16*1024*1024)
  // For the kernel code model we know that all objects reside in the negative
  // half of the 32-bit address space. We may not accept negative offsets,
  // since they may take the address just out of that half, but we may accept
  // pretty large positive ones.
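  // Illustrative examples: with the small model, a symbolic address plus
  // 0x00FFFFFF is accepted while plus 0x01000000 (16MB) is not; with the
  // kernel model, plus 0x1000 is accepted while minus 8 is not.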
4182 if (M == CodeModel::Kernel && Offset >= 0)
4188 /// Determines whether the callee is required to pop its own arguments.
4189 /// Callee pop is necessary to support tail calls.
4190 bool X86::isCalleePop(CallingConv::ID CallingConv,
4191 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4192 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4193 // can guarantee TCO.
4194 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4197 switch (CallingConv) {
4200 case CallingConv::X86_StdCall:
4201 case CallingConv::X86_FastCall:
4202 case CallingConv::X86_ThisCall:
4203 case CallingConv::X86_VectorCall:
4208 /// \brief Return true if the condition is an unsigned comparison operation.
4209 static bool isX86CCUnsigned(unsigned X86CC) {
4212 llvm_unreachable("Invalid integer condition!");
4228 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4229 switch (SetCCOpcode) {
4230 default: llvm_unreachable("Invalid integer condition!");
4231 case ISD::SETEQ: return X86::COND_E;
4232 case ISD::SETGT: return X86::COND_G;
4233 case ISD::SETGE: return X86::COND_GE;
4234 case ISD::SETLT: return X86::COND_L;
4235 case ISD::SETLE: return X86::COND_LE;
4236 case ISD::SETNE: return X86::COND_NE;
4237 case ISD::SETULT: return X86::COND_B;
4238 case ISD::SETUGT: return X86::COND_A;
4239 case ISD::SETULE: return X86::COND_BE;
4240 case ISD::SETUGE: return X86::COND_AE;
4244 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4245 /// condition code, returning the condition code and the LHS/RHS of the
4246 /// comparison to make.
4247 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4248 bool isFP, SDValue &LHS, SDValue &RHS,
4249 SelectionDAG &DAG) {
4251 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4252 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4253 // X > -1 -> X == 0, jump !sign.
4254 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4255 return X86::COND_NS;
4257 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
      // X < 0 -> X == 0, jump on sign.
      return X86::COND_S;
    }
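    // X < 1 -> X <= 0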
4261 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4263 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4264 return X86::COND_LE;
4268 return TranslateIntegerX86CC(SetCCOpcode);
4271 // First determine if it is required or is profitable to flip the operands.
4273 // If LHS is a foldable load, but RHS is not, flip the condition.
4274 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4275 !ISD::isNON_EXTLoad(RHS.getNode())) {
4276 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4277 std::swap(LHS, RHS);
4280 switch (SetCCOpcode) {
4286 std::swap(LHS, RHS);
  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
4296 switch (SetCCOpcode) {
4297 default: llvm_unreachable("Condcode should be pre-legalized away");
4299 case ISD::SETEQ: return X86::COND_E;
4300 case ISD::SETOLT: // flipped
4302 case ISD::SETGT: return X86::COND_A;
4303 case ISD::SETOLE: // flipped
4305 case ISD::SETGE: return X86::COND_AE;
4306 case ISD::SETUGT: // flipped
4308 case ISD::SETLT: return X86::COND_B;
4309 case ISD::SETUGE: // flipped
4311 case ISD::SETLE: return X86::COND_BE;
4313 case ISD::SETNE: return X86::COND_NE;
4314 case ISD::SETUO: return X86::COND_P;
4315 case ISD::SETO: return X86::COND_NP;
4317 case ISD::SETUNE: return X86::COND_INVALID;
4321 /// Is there a floating point cmov for the specific X86 condition code?
/// Current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4324 static bool hasFPCMov(unsigned X86CC) {
4341 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4343 unsigned Intrinsic) const {
4345 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4349 Info.opc = ISD::INTRINSIC_W_CHAIN;
4350 Info.readMem = false;
4351 Info.writeMem = false;
4355 switch (IntrData->Type) {
4356 case EXPAND_FROM_MEM: {
4357 Info.ptrVal = I.getArgOperand(0);
4358 Info.memVT = MVT::getVT(I.getType());
4360 Info.readMem = true;
4363 case COMPRESS_TO_MEM: {
4364 Info.ptrVal = I.getArgOperand(0);
4365 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4367 Info.writeMem = true;
4370 case TRUNCATE_TO_MEM_VI8:
4371 case TRUNCATE_TO_MEM_VI16:
4372 case TRUNCATE_TO_MEM_VI32: {
4373 Info.ptrVal = I.getArgOperand(0);
4374 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4375 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4376 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4378 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4379 ScalarVT = MVT::i16;
4380 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4381 ScalarVT = MVT::i32;
4383 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4385 Info.writeMem = true;
4395 /// Returns true if the target can instruction select the
4396 /// specified FP immediate natively. If false, the legalizer will
4397 /// materialize the FP immediate as a load from a constant pool.
4398 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4399 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4400 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4406 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4407 ISD::LoadExtType ExtTy,
  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocations must target a movq or addq instruction: don't let the load
  // shrink.
4411 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4412 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4413 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4414 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4418 /// \brief Returns true if it is beneficial to convert a load of a constant
4419 /// to just the constant itself.
4420 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4422 assert(Ty->isIntegerTy());
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
    return false;

  return true;
}
4430 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4431 unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}
4438 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4439 // Speculate cttz only if we can directly use TZCNT.
4440 return Subtarget.hasBMI();
4443 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4444 // Speculate ctlz only if we can directly use LZCNT.
4445 return Subtarget.hasLZCNT();
4448 bool X86TargetLowering::isCtlzFast() const {
4449 return Subtarget.hasFastLZCNT();
4452 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  if (!Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  EVT VT = Y.getValueType();
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  return true;
}
4464 /// Val is the undef sentinel value or equal to the specified value.
4465 static bool isUndefOrEqual(int Val, int CmpVal) {
4466 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4469 /// Val is either the undef or zero sentinel value.
4470 static bool isUndefOrZero(int Val) {
4471 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4474 /// Return true if every element in Mask, beginning
4475 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4476 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (Mask[i] != SM_SentinelUndef)
      return false;
  return true;
}
/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
4485 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4486 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask,
                             int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrInRange(M, Low, Hi))
      return false;
  return true;
}
/// Return true if Val is undef, zero or if its value falls within the
/// specified range [Low, Hi).
4501 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4502 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrZeroOrInRange(M, Low, Hi))
      return false;
  return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                       unsigned Pos, unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                             unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
      return false;
  return true;
}
4536 /// Return true if every element in Mask, beginning
4537 /// from position Pos and ending in Pos+Size is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                 unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (!isUndefOrZero(Mask[i]))
      return false;
  return true;
}
4546 /// \brief Helper function to test whether a shuffle mask could be
4547 /// simplified by widening the elements being shuffled.
4549 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4550 /// leaves it in an unspecified state.
4552 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4553 /// shuffle masks. The latter have the special property of a '-2' representing
4554 /// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    // If both elements are undef, it's trivial.
    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
        Mask[i + 1] % 2 == 1) {
      WidenedMask[i / 2] = Mask[i + 1] / 2;
      continue;
    }
    if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
      WidenedMask[i / 2] = Mask[i] / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
      if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
          (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair.
    if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
        Mask[i] + 1 == Mask[i + 1]) {
      WidenedMask[i / 2] = Mask[i] / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}
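// Editorial examples, not from the original source, of the widening rules
// above, shown for a hypothetical 4-element mask (U = SM_SentinelUndef,
// Z = SM_SentinelZero):
//   <0, 1, 6, 7> -> <0, 3>   (adjacent, even-aligned pairs)
//   <U, 1, 4, 5> -> <0, 2>   (undef paired with a properly aligned odd index)
//   <Z, U, 2, 3> -> <Z, 1>   (zeroing must cover the whole widened lane)
//   <0, 2, 4, 5> -> fails    (0 and 2 are not an adjacent pair)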
/// Helper function to scale a shuffle or target shuffle mask, replacing each
/// mask index with the scaled sequential indices for an equivalent narrowed
/// mask. This is the reverse process to canWidenShuffleElements, but can
/// always succeed.
static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
                             SmallVectorImpl<int> &ScaledMask) {
  assert(0 < Scale && "Unexpected scaling factor");
  int NumElts = Mask.size();
  ScaledMask.assign(NumElts * Scale, -1);

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];

    // Repeat sentinel values in every mask element.
    if (M < 0) {
      for (int s = 0; s != Scale; ++s)
        ScaledMask[(Scale * i) + s] = M;
      continue;
    }

    // Scale mask element and increment across each mask element.
    for (int s = 0; s != Scale; ++s)
      ScaledMask[(Scale * i) + s] = (Scale * M) + s;
  }
}
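// A quick illustration (hypothetical values, not from the original source):
// scaling the mask <1, U, 2, 0> by Scale = 2 yields
//   <2, 3, U, U, 4, 5, 0, 1>
// i.e. each index M becomes the pair (2*M, 2*M+1) and sentinels are repeated.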
/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
/// extract that is suitable for instructions that extract 128- or 256-bit
/// vectors.
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
    return false;

  // The index should be aligned on a vecWidth-bit boundary.
  uint64_t Index =
      cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();

  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getScalarSizeInBits();
  bool Result = (Index * ElSize) % vecWidth == 0;

  return Result;
}
4648 /// Return true if the specified INSERT_SUBVECTOR
4649 /// operand specifies a subvector insert that is suitable for input to
4650 /// insertion of 128 or 256-bit subvectors
static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
    return false;

  // The index should be aligned on a vecWidth-bit boundary.
  uint64_t Index =
      cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();

  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getScalarSizeInBits();
  bool Result = (Index * ElSize) % vecWidth == 0;

  return Result;
}
bool X86::isVINSERT128Index(SDNode *N) {
  return isVINSERTIndex(N, 128);
}

bool X86::isVINSERT256Index(SDNode *N) {
  return isVINSERTIndex(N, 256);
}

bool X86::isVEXTRACT128Index(SDNode *N) {
  return isVEXTRACTIndex(N, 128);
}

bool X86::isVEXTRACT256Index(SDNode *N) {
  return isVEXTRACTIndex(N, 256);
}
static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
  assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
         "Illegal extract subvector for VEXTRACT");

  uint64_t Index =
      cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();

  MVT VecVT = N->getOperand(0).getSimpleValueType();
  MVT ElVT = VecVT.getVectorElementType();

  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
  return Index / NumElemsPerChunk;
}
static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
  assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
         "Illegal insert subvector for VINSERT");

  uint64_t Index =
      cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();

  MVT VecVT = N->getSimpleValueType(0);
  MVT ElVT = VecVT.getVectorElementType();

  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
  return Index / NumElemsPerChunk;
}
/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 128);
}

/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 256);
}

/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 128);
}

/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 256);
}
4736 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4737 bool X86::isZeroNode(SDValue Elt) {
  return isNullConstant(Elt) || isNullFPConstant(Elt);
}

// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {

  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
                     DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    if (Split)
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                              DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}
static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
    } else if (EltVT == MVT::f32) {
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else if (EltVT == MVT::f64) {
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else {
      Ops.push_back(DAG.getConstant(V, dl, EltVT));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}
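// Sketch of the 32-bit split behaviour above (illustrative, little-endian):
// lowering a v2i64 constant <0x0000000100000002, 0> when i64 is not legal
// emits the v4i32 build vector <0x2, 0x1, 0x0, 0x0> (low 32 bits first) and
// bitcasts the result back to v2i64.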
4814 /// Returns a vector of specified type with all zero elements.
4815 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4816 SelectionDAG &DAG, const SDLoc &dl) {
4817 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4818 VT.getVectorElementType() == MVT::i1) &&
4819 "Unexpected vector type");
4821 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4822 // type. This ensures they get CSE'd. But if the integer type is not
4823 // available, use a floating-point +0.0 instead.
  SDValue Vec;
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4826 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4827 } else if (VT.getVectorElementType() == MVT::i1) {
4828 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4829 "Unexpected vector type");
4830 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4831 "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
  } else {
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  }
4837 return DAG.getBitcast(VT, Vec);
4840 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4841 const SDLoc &dl, unsigned vectorWidth) {
4842 EVT VT = Vec.getValueType();
4843 EVT ElVT = VT.getVectorElementType();
4844 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4845 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4846 VT.getVectorNumElements()/Factor);
  // Extract from UNDEF is UNDEF.
  if (Vec.isUndef())
    return DAG.getUNDEF(ResultVT);
4852 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4853 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4854 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4856 // This is the index of the first element of the vectorWidth-bit chunk
4857 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4858 IdxVal &= ~(ElemsPerChunk - 1);
4860 // If the input is a buildvector just emit a smaller one.
4861 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4862 return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
4863 makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
4865 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
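// Example of the index rounding above (illustrative, not from the original
// source): extracting 128 bits from a v8i32 gives ElemsPerChunk = 4, so
// IdxVal = 5 is rounded down to 4 and the extract covers elements <4,5,6,7>.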
4869 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4870 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4871 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4872 /// instructions or a simple subregister reference. Idx is an index in the
4873 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4874 /// lowering EXTRACT_VECTOR_ELT operations easier.
4875 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4876 SelectionDAG &DAG, const SDLoc &dl) {
4877 assert((Vec.getValueType().is256BitVector() ||
4878 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4879 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4882 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4883 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4884 SelectionDAG &DAG, const SDLoc &dl) {
4885 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4886 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4889 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4890 SelectionDAG &DAG, const SDLoc &dl,
4891 unsigned vectorWidth) {
4892 assert((vectorWidth == 128 || vectorWidth == 256) &&
4893 "Unsupported vector width");
  // Inserting UNDEF is Result.
  if (Vec.isUndef())
    return Result;
4897 EVT VT = Vec.getValueType();
4898 EVT ElVT = VT.getVectorElementType();
4899 EVT ResultVT = Result.getValueType();
4901 // Insert the relevant vectorWidth bits.
4902 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4903 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4905 // This is the index of the first element of the vectorWidth-bit chunk
4906 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4907 IdxVal &= ~(ElemsPerChunk - 1);
4909 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4910 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4913 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
4914 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4915 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4916 /// simple superregister reference. Idx is an index in the 128 bits
4917 /// we want. It need not be aligned to a 128-bit boundary. That makes
4918 /// lowering INSERT_VECTOR_ELT operations easier.
4919 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4920 SelectionDAG &DAG, const SDLoc &dl) {
4921 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4923 // For insertion into the zero index (low half) of a 256-bit vector, it is
4924 // more efficient to generate a blend with immediate instead of an insert*128.
4925 // We are still creating an INSERT_SUBVECTOR below with an undef node to
4926 // extend the subvector to the size of the result vector. Make sure that
4927 // we are not recursing on that node by checking for undef here.
4928 if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
4929 !Result.isUndef()) {
4930 EVT ResultVT = Result.getValueType();
4931 SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
4932 SDValue Undef = DAG.getUNDEF(ResultVT);
    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
                                 Vec, ZeroIndex);
4936 // The blend instruction, and therefore its mask, depend on the data type.
4937 MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
4938 if (ScalarType.isFloatingPoint()) {
4939 // Choose either vblendps (float) or vblendpd (double).
4940 unsigned ScalarSize = ScalarType.getSizeInBits();
4941 assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
4942 unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
4943 SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
4944 return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
4947 const X86Subtarget &Subtarget =
4948 static_cast<const X86Subtarget &>(DAG.getSubtarget());
4950 // AVX2 is needed for 256-bit integer blend support.
4951 // Integers must be cast to 32-bit because there is only vpblendd;
4952 // vpblendw can't be used for this because it has a handicapped mask.
4954 // If we don't have AVX2, then cast to float. Using a wrong domain blend
4955 // is still more efficient than using the wrong domain vinsertf128 that
4956 // will be created by InsertSubVector().
4957 MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
4959 SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
4960 Result = DAG.getBitcast(CastVT, Result);
4961 Vec256 = DAG.getBitcast(CastVT, Vec256);
4962 Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
4963 return DAG.getBitcast(ResultVT, Vec256);
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
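// Illustrative example of the blend path above: inserting a v4f32 subvector
// at index 0 of a v8f32 result becomes a BLENDI with mask 0x0f (take the four
// low float lanes from the widened subvector), i.e. a single vblendps rather
// than a domain-crossing vinsertf128.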
4969 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4970 SelectionDAG &DAG, const SDLoc &dl) {
4971 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4972 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
/// Insert an i1-subvector into an i1-vector.
4976 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4977 const X86Subtarget &Subtarget) {
4980 SDValue Vec = Op.getOperand(0);
4981 SDValue SubVec = Op.getOperand(1);
4982 SDValue Idx = Op.getOperand(2);
  if (!isa<ConstantSDNode>(Idx))
    return SDValue();
4987 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
    return Op;
4991 MVT OpVT = Op.getSimpleValueType();
4992 MVT SubVecVT = SubVec.getSimpleValueType();
4993 unsigned NumElems = OpVT.getVectorNumElements();
4994 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4996 assert(IdxVal + SubVecNumElems <= NumElems &&
4997 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4998 "Unexpected index value in INSERT_SUBVECTOR");
5000 // There are 3 possible cases:
5001 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5002 // 2. Subvector should be inserted in the upper part
5003 // (IdxVal + SubVecNumElems == NumElems)
5004 // 3. Subvector should be inserted in the middle (for example v2i1
5005 // to v16i1, index 2)
5007 // extend to natively supported kshift
5008 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5009 MVT WideOpVT = OpVT;
  if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
    WideOpVT = MinVT;
5013 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5014 SDValue Undef = DAG.getUNDEF(WideOpVT);
5015 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5016 Undef, SubVec, ZeroIdx);
  // Extract sub-vector if required.
  auto ExtractSubVec = [&](SDValue V) {
    return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
                                                OpVT, V, ZeroIdx);
  };
  if (Vec.isUndef()) {
    if (IdxVal != 0) {
      SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
      WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
                               ShiftBits);
    }
    return ExtractSubVec(WideSubVec);
  }
5032 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5033 NumElems = WideOpVT.getVectorNumElements();
5034 unsigned ShiftLeft = NumElems - SubVecNumElems;
5035 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5036 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
5037 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5038 Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
5039 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5040 return ExtractSubVec(Vec);
  }

  if (IdxVal == 0) {
    // Zero lower bits of the Vec.
    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
    Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together, SubVec should be zero extended.
    WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                             getZeroVector(WideOpVT, Subtarget, DAG, dl),
                             SubVec, ZeroIdx);
    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
    return ExtractSubVec(Vec);
  }
5057 // Simple case when we put subvector in the upper part
5058 if (IdxVal + SubVecNumElems == NumElems) {
5059 // Zero upper bits of the Vec
5060 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
5061 DAG.getConstant(IdxVal, dl, MVT::i8));
5062 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5063 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5064 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
5065 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
5066 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
    return ExtractSubVec(Vec);
  }
5069 // Subvector should be inserted in the middle - use shuffle
  WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
                           SubVec, ZeroIdx);
5072 SmallVector<int, 64> Mask;
5073 for (unsigned i = 0; i < NumElems; ++i)
    Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
                   i - IdxVal : NumElems + i);
  return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
}
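// Worked example for the middle-insertion shuffle above (hypothetical types):
// inserting a v2i1 subvector at IdxVal = 2 into a v8i1 builds the mask
//   <8, 9, 0, 1, 12, 13, 14, 15>
// where indices 0..7 select the (low-inserted) subvector and indices 8..15
// select the original vector.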
/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
5080 /// instructions. This is used because creating CONCAT_VECTOR nodes of
5081 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5082 /// large BUILD_VECTORS.
5083 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5084 unsigned NumElems, SelectionDAG &DAG,
5086 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5087 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5090 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5091 unsigned NumElems, SelectionDAG &DAG,
5093 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5094 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5097 /// Returns a vector of specified type with all bits set.
5098 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5099 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
5100 /// Then bitcast to their original type, ensuring they get CSE'd.
5101 static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
5102 SelectionDAG &DAG, const SDLoc &dl) {
5103 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5104 "Expected a 128/256/512-bit vector type");
5106 APInt Ones = APInt::getAllOnesValue(32);
5107 unsigned NumElts = VT.getSizeInBits() / 32;
  SDValue Vec;
  if (!Subtarget.hasInt256() && NumElts == 8) {
5110 Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
5111 Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
  } else {
    Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
  }
5115 return DAG.getBitcast(VT, Vec);
5118 /// Generate unpacklo/unpackhi shuffle mask.
static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
                                    bool Unary) {
5121 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5122 int NumElts = VT.getVectorNumElements();
5123 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5125 for (int i = 0; i < NumElts; ++i) {
5126 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5127 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5128 Pos += (Unary ? 0 : NumElts * (i % 2));
5129 Pos += (Lo ? 0 : NumEltsInLane / 2);
    Mask.push_back(Pos);
  }
}
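// For reference (illustrative masks, assuming a v8i16 type and Unary=false):
//   unpacklo mask: <0, 8, 1, 9, 2, 10, 3, 11>
//   unpackhi mask: <4, 12, 5, 13, 6, 14, 7, 15>
// With Unary=true both sides of each pair come from the first operand:
//   unpacklo mask: <0, 0, 1, 1, 2, 2, 3, 3>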
5134 /// Returns a vector_shuffle node for an unpackl operation.
5135 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5136 SDValue V1, SDValue V2) {
5137 SmallVector<int, 8> Mask;
5138 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5139 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5142 /// Returns a vector_shuffle node for an unpackh operation.
5143 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5144 SDValue V1, SDValue V2) {
5145 SmallVector<int, 8> Mask;
5146 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5147 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
/// Return a vector_shuffle of the specified vector and a zero or undef vector.
5151 /// This produces a shuffle where the low element of V2 is swizzled into the
5152 /// zero/undef vector, landing at element Idx.
5153 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
                                           bool IsZero,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  MVT VT = V2.getSimpleValueType();
  SDValue V1 = IsZero
    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5161 int NumElems = VT.getVectorNumElements();
5162 SmallVector<int, 16> MaskVec(NumElems);
5163 for (int i = 0; i != NumElems; ++i)
5164 // If this is the insertion idx, put the low elt of V2 here.
5165 MaskVec[i] = (i == Idx) ? NumElems : i;
5166 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5169 static SDValue peekThroughBitcasts(SDValue V) {
  while (V.getNode() && V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  return V;
}
5175 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5176 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5177 V.getOperand(0).hasOneUse())
    V = V.getOperand(0);
  return V;
}
5182 static const Constant *getTargetConstantFromNode(SDValue Op) {
5183 Op = peekThroughBitcasts(Op);
  auto *Load = dyn_cast<LoadSDNode>(Op);
  if (!Load)
    return nullptr;
5189 SDValue Ptr = Load->getBasePtr();
5190 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5191 Ptr->getOpcode() == X86ISD::WrapperRIP)
5192 Ptr = Ptr->getOperand(0);
5194 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
  if (!CNode || CNode->isMachineConstantPoolEntry())
    return nullptr;
5198 return dyn_cast<Constant>(CNode->getConstVal());
5201 // Extract raw constant bits from constant pools.
5202 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5203 SmallBitVector &UndefElts,
5204 SmallVectorImpl<APInt> &EltBits) {
5205 assert(UndefElts.empty() && "Expected an empty UndefElts vector");
5206 assert(EltBits.empty() && "Expected an empty EltBits vector");
5208 Op = peekThroughBitcasts(Op);
5210 EVT VT = Op.getValueType();
5211 unsigned SizeInBits = VT.getSizeInBits();
5212 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5213 unsigned NumElts = SizeInBits / EltSizeInBits;
5215 // Extract all the undef/constant element data and pack into single bitsets.
5216 APInt UndefBits(SizeInBits, 0);
5217 APInt MaskBits(SizeInBits, 0);
5219 // Split the undef/constant single bitset data into the target elements.
5220 auto SplitBitData = [&]() {
5221 UndefElts = SmallBitVector(NumElts, false);
5222 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5224 for (unsigned i = 0; i != NumElts; ++i) {
5225 APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
5226 UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
5228 // Only treat an element as UNDEF if all bits are UNDEF, otherwise
5229 // treat it as zero.
5230 if (UndefEltBits.isAllOnesValue()) {
5231 UndefElts[i] = true;
5235 APInt Bits = MaskBits.lshr(i * EltSizeInBits);
5236 Bits = Bits.zextOrTrunc(EltSizeInBits);
5237 EltBits[i] = Bits.getZExtValue();
5242 auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
5246 unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
5247 if (isa<UndefValue>(Cst)) {
5248 Mask = APInt::getNullValue(SizeInBits);
5249 Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
5252 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5253 Mask = CInt->getValue().zextOrTrunc(SizeInBits);
5254 Undefs = APInt::getNullValue(SizeInBits);
5257 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5258 Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
5259 Undefs = APInt::getNullValue(SizeInBits);
5265 // Extract constant bits from constant pool vector.
5266 if (auto *Cst = getTargetConstantFromNode(Op)) {
5267 Type *CstTy = Cst->getType();
5268 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5271 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5272 for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
5274 if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
5276 MaskBits |= Bits.shl(i * CstEltSizeInBits);
5277 UndefBits |= Undefs.shl(i * CstEltSizeInBits);
5280 return SplitBitData();
5283 // Extract constant bits from a broadcasted constant pool scalar.
5284 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5285 EltSizeInBits <= Op.getScalarValueSizeInBits()) {
5286 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5288 if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
5289 unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
5290 unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
5291 for (unsigned i = 0; i != NumBroadcastElts; ++i) {
5292 MaskBits |= Bits.shl(i * NumBroadcastBits);
5293 UndefBits |= Undefs.shl(i * NumBroadcastBits);
        return SplitBitData();
      }
    }
  }

  return false;
}
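// Sketch of the splitting logic above (illustrative): a constant-pool v2i64
// value queried with EltSizeInBits = 8 is repacked into 16 byte-sized APInts.
// An element is reported undef only if *all* of its covered source bits were
// undef; partially undef source bits are treated as zero instead.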
5303 // TODO: Merge more of this with getTargetConstantBitsFromNode.
5304 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5305 unsigned MaskEltSizeInBits,
5306 SmallVectorImpl<uint64_t> &RawMask) {
5307 MaskNode = peekThroughBitcasts(MaskNode);
5309 MVT VT = MaskNode.getSimpleValueType();
5310 assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
5311 unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
5313 // Split an APInt element into MaskEltSizeInBits sized pieces and
5314 // insert into the shuffle mask.
5315 auto SplitElementToMask = [&](APInt Element) {
5316 // Note that this is x86 and so always little endian: the low byte is
5317 // the first byte of the mask.
5318 int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
5319 for (int i = 0; i < Split; ++i) {
5320 APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
5321 Element = Element.lshr(MaskEltSizeInBits);
5322 RawMask.push_back(RawElt.getZExtValue());
5326 if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
5327 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
5328 // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
5329 if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
5331 if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
5332 const APInt &MaskElement = CN->getAPIntValue();
5333 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
5334 APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
5335 RawMask.push_back(RawElt.getZExtValue());
5341 if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
5342 MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
5343 SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
5344 if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
5345 if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
5346 RawMask.push_back(CN->getZExtValue());
5347 RawMask.append(NumMaskElts - 1, 0);
5351 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
5352 unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
5353 SplitElementToMask(CN->getAPIntValue());
5354 RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
5361 if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
5364 // We can always decode if the buildvector is all zero constants,
5365 // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
5366 if (all_of(MaskNode->ops(), X86::isZeroNode)) {
5367 RawMask.append(NumMaskElts, 0);
5371 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
5372 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
5375 for (SDValue Op : MaskNode->ops()) {
5376 if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
5377 SplitElementToMask(CN->getAPIntValue());
    else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
      SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
    else
      return false;
  }

  return true;
}
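// Little-endian split example for SplitElementToMask (illustrative values):
// a 32-bit build_vector element 0x03020100 decoded at MaskEltSizeInBits = 8
// appends the raw mask indices <0x00, 0x01, 0x02, 0x03>, low byte first.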
5387 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5388 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5389 /// operands in \p Ops, and returns true.
5390 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5391 /// IsUnary for shuffles which use a single input multiple times, and in those
5392 /// cases it will adjust the mask to only have indices within that single input.
5393 /// It is an error to call this with non-empty Mask/Ops vectors.
5394 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5395 SmallVectorImpl<SDValue> &Ops,
5396 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  SDValue ImmN;

  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

  IsUnary = false;
  bool IsFakeUnary = false;
5405 switch(N->getOpcode()) {
5406 case X86ISD::BLENDI:
5407 ImmN = N->getOperand(N->getNumOperands()-1);
5408 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5411 ImmN = N->getOperand(N->getNumOperands()-1);
5412 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5413 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5415 case X86ISD::INSERTPS:
5416 ImmN = N->getOperand(N->getNumOperands()-1);
5417 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5418 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5420 case X86ISD::UNPCKH:
5421 DecodeUNPCKHMask(VT, Mask);
5422 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5424 case X86ISD::UNPCKL:
5425 DecodeUNPCKLMask(VT, Mask);
5426 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5428 case X86ISD::MOVHLPS:
5429 DecodeMOVHLPSMask(NumElems, Mask);
5430 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5432 case X86ISD::MOVLHPS:
5433 DecodeMOVLHPSMask(NumElems, Mask);
5434 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5436 case X86ISD::PALIGNR:
5437 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5438 ImmN = N->getOperand(N->getNumOperands()-1);
5439 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5440 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5441 Ops.push_back(N->getOperand(1));
5442 Ops.push_back(N->getOperand(0));
5444 case X86ISD::VSHLDQ:
5445 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5446 ImmN = N->getOperand(N->getNumOperands() - 1);
5447 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5450 case X86ISD::VSRLDQ:
5451 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5452 ImmN = N->getOperand(N->getNumOperands() - 1);
5453 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5456 case X86ISD::PSHUFD:
5457 case X86ISD::VPERMILPI:
5458 ImmN = N->getOperand(N->getNumOperands()-1);
5459 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5462 case X86ISD::PSHUFHW:
5463 ImmN = N->getOperand(N->getNumOperands()-1);
5464 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5467 case X86ISD::PSHUFLW:
5468 ImmN = N->getOperand(N->getNumOperands()-1);
5469 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5472 case X86ISD::VZEXT_MOVL:
5473 DecodeZeroMoveLowMask(VT, Mask);
5476 case X86ISD::VBROADCAST: {
5477 // We only decode broadcasts of same-sized vectors at the moment.
5478 if (N->getOperand(0).getValueType() == VT) {
5479 DecodeVectorBroadcast(VT, Mask);
5485 case X86ISD::VPERMILPV: {
5487 SDValue MaskNode = N->getOperand(1);
5488 unsigned MaskEltSize = VT.getScalarSizeInBits();
5489 SmallVector<uint64_t, 32> RawMask;
5490 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5491 DecodeVPERMILPMask(VT, RawMask, Mask);
5494 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5495 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5500 case X86ISD::PSHUFB: {
5502 SDValue MaskNode = N->getOperand(1);
5503 SmallVector<uint64_t, 32> RawMask;
5504 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5505 DecodePSHUFBMask(RawMask, Mask);
5508 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5509 DecodePSHUFBMask(C, Mask);
5514 case X86ISD::VPERMI:
5515 ImmN = N->getOperand(N->getNumOperands()-1);
5516 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5521 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5523 case X86ISD::VPERM2X128:
5524 ImmN = N->getOperand(N->getNumOperands()-1);
5525 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5526 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5528 case X86ISD::MOVSLDUP:
5529 DecodeMOVSLDUPMask(VT, Mask);
5532 case X86ISD::MOVSHDUP:
5533 DecodeMOVSHDUPMask(VT, Mask);
5536 case X86ISD::MOVDDUP:
5537 DecodeMOVDDUPMask(VT, Mask);
5540 case X86ISD::MOVLHPD:
5541 case X86ISD::MOVLPD:
5542 case X86ISD::MOVLPS:
5543 // Not yet implemented
5545 case X86ISD::VPERMIL2: {
5546 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5547 unsigned MaskEltSize = VT.getScalarSizeInBits();
5548 SDValue MaskNode = N->getOperand(2);
5549 SDValue CtrlNode = N->getOperand(3);
5550 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5551 unsigned CtrlImm = CtrlOp->getZExtValue();
5552 SmallVector<uint64_t, 32> RawMask;
5553 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5554 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5557 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5558 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5564 case X86ISD::VPPERM: {
5565 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5566 SDValue MaskNode = N->getOperand(2);
5567 SmallVector<uint64_t, 32> RawMask;
5568 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5569 DecodeVPPERMMask(RawMask, Mask);
5572 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5573 DecodeVPPERMMask(C, Mask);
5578 case X86ISD::VPERMV: {
5580 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5581 Ops.push_back(N->getOperand(1));
5582 SDValue MaskNode = N->getOperand(0);
5583 SmallVector<uint64_t, 32> RawMask;
5584 unsigned MaskEltSize = VT.getScalarSizeInBits();
5585 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5586 DecodeVPERMVMask(RawMask, Mask);
5589 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5590 DecodeVPERMVMask(C, MaskEltSize, Mask);
5595 case X86ISD::VPERMV3: {
5596 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5597 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5598 Ops.push_back(N->getOperand(0));
5599 Ops.push_back(N->getOperand(2));
5600 SDValue MaskNode = N->getOperand(1);
5601 unsigned MaskEltSize = VT.getScalarSizeInBits();
5602 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5603 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5608 case X86ISD::VPERMIV3: {
5609 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5610 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5611 Ops.push_back(N->getOperand(1));
5612 Ops.push_back(N->getOperand(2));
5613 SDValue MaskNode = N->getOperand(0);
5614 unsigned MaskEltSize = VT.getScalarSizeInBits();
5615 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5616 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5621 default: llvm_unreachable("unknown target shuffle node");
  // Empty mask indicates the decode failed.
  if (Mask.empty())
    return false;
5628 // Check if we're getting a shuffle mask with zero'd elements.
5629 if (!AllowSentinelZero)
    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
      return false;
5633 // If we have a fake unary shuffle, the shuffle mask is spread across two
5634 // inputs that are actually the same node. Re-map the mask to always point
5635 // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();
5641 // If we didn't already add operands in the opcode-specific code, default to
5642 // adding 1 or 2 operands starting at 0.
  if (Ops.empty()) {
    Ops.push_back(N->getOperand(0));
    if (!IsUnary || IsFakeUnary)
      Ops.push_back(N->getOperand(1));
  }

  return true;
}
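// Decode example (illustrative): a PSHUFD node with immediate 0x1B
// (0b00011011) yields Mask = <3, 2, 1, 0>, since each 2-bit immediate field
// selects the source element for one lane, starting from the low bits.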
5652 /// Check a target shuffle mask's inputs to see if we can set any values to
5653 /// SM_SentinelZero - this is for elements that are known to be zero
5654 /// (not just zeroable) from their inputs.
5655 /// Returns true if the target shuffle mask was decoded.
5656 static bool setTargetShuffleZeroElements(SDValue N,
5657 SmallVectorImpl<int> &Mask,
5658 SmallVectorImpl<SDValue> &Ops) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;
5667 SDValue V1 = Ops[0];
5668 SDValue V2 = IsUnary ? V1 : Ops[1];
5670 V1 = peekThroughBitcasts(V1);
5671 V2 = peekThroughBitcasts(V2);
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0)
      continue;

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      Mask[i] = SM_SentinelUndef;
      continue;
    }
5690 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;
    // If the BUILD_VECTOR has fewer elements, then the (larger) source
    // element must be UNDEF/ZERO.
5696 // TODO: Is it worth testing the individual bits of a constant?
5697 if ((Size % V.getNumOperands()) == 0) {
5698 int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef())
        Mask[i] = SM_SentinelUndef;
      else if (X86::isZeroNode(Op))
        Mask[i] = SM_SentinelZero;
      continue;
    }
    // If the BUILD_VECTOR has more elements, then all of the (smaller) source
    // elements must be UNDEF or ZERO.
5709 if ((V.getNumOperands() % Size) == 0) {
5710 int Scale = V->getNumOperands() / Size;
5711 bool AllUndef = true;
5712 bool AllZero = true;
5713 for (int j = 0; j < Scale; ++j) {
5714 SDValue Op = V.getOperand((M * Scale) + j);
5715 AllUndef &= Op.isUndef();
        AllZero &= X86::isZeroNode(Op);
      }
      if (AllUndef)
        Mask[i] = SM_SentinelUndef;
      else if (AllZero)
        Mask[i] = SM_SentinelZero;
    }
  }
  assert(VT.getVectorNumElements() == Mask.size() &&
         "Different mask size from vector size!");
  return true;
}
5731 // Attempt to decode ops that could be represented as a shuffle mask.
5732 // The decoded shuffle mask may contain a different number of elements to the
5733 // destination value type.
5734 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5735 SmallVectorImpl<SDValue> &Ops) {
  if (!N.getValueType().isSimple() || !N.getValueType().isVector())
    return false;

  MVT VT = N.getSimpleValueType();
5740 unsigned NumElts = VT.getVectorNumElements();
5741 unsigned NumSizeInBits = VT.getSizeInBits();
5742 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5743 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5744 "Expected byte aligned value types");
  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case ISD::AND: {
    // Attempt to decode as a per-byte mask.
5750 SmallBitVector UndefElts;
5751 SmallVector<APInt, 32> EltBits;
    if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits))
      return false;
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      if (UndefElts[i]) {
        Mask.push_back(SM_SentinelUndef);
        continue;
      }
      uint64_t ByteBits = EltBits[i].getZExtValue();
      if (ByteBits != 0 && ByteBits != 255)
        return false;
      Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i);
    }
    Ops.push_back(N.getOperand(0));
    return true;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
5769 uint64_t ShiftVal = N.getConstantOperandVal(1);
5770 // Out of range bit shifts are guaranteed to be zero.
    if (NumBitsPerElt <= ShiftVal) {
      Mask.append(NumElts, SM_SentinelZero);
      return true;
    }
5776 // We can only decode 'whole byte' bit shifts as shuffles.
5777 if ((ShiftVal % 8) != 0)
5780 uint64_t ByteShift = ShiftVal / 8;
5781 unsigned NumBytes = NumSizeInBits / 8;
5782 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5783 Ops.push_back(N.getOperand(0));
5785 // Clear mask to all zeros and insert the shifted byte indices.
5786 Mask.append(NumBytes, SM_SentinelZero);
5788 if (X86ISD::VSHLI == Opcode) {
5789 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5790 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
    } else {
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    }
    return true;
  }
  case X86ISD::VZEXT: {
5800 // TODO - add support for VPMOVZX with smaller input vector types.
5801 SDValue Src = N.getOperand(0);
5802 MVT SrcVT = Src.getSimpleValueType();
    if (NumSizeInBits != SrcVT.getSizeInBits())
      break;
    DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
    Ops.push_back(Src);
    return true;
  }
  }

  return false;
}
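// Byte-shuffle example for the shift case above (illustrative): a v2i64 VSHLI
// by 8 bits has ByteShift = 1, so the per-byte mask becomes
//   <Z, 0, 1, 2, 3, 4, 5, 6, Z, 8, 9, 10, 11, 12, 13, 14>
// (Z = SM_SentinelZero), i.e. each 8-byte element shifts in one zero byte.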
5814 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
5815 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
5816 /// remaining input indices in case we now have a unary shuffle and adjust the
5817 /// Op0/Op1 inputs accordingly.
5818 /// Returns true if the target shuffle mask was decoded.
5819 static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
5820 SmallVectorImpl<int> &Mask) {
5821 SmallVector<SDValue, 2> Ops;
  if (!setTargetShuffleZeroElements(Op, Mask, Ops))
    if (!getFauxShuffleMask(Op, Mask, Ops))
      return false;
5826 int NumElts = Mask.size();
5827 bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
    return 0 <= Idx && Idx < NumElts;
  });
5830 bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; });
5832 Op0 = Op0InUse ? Ops[0] : SDValue();
5833 Op1 = Op1InUse ? Ops[1] : SDValue();
5835 // We're only using Op1 - commute the mask and inputs.
  if (!Op0InUse && Op1InUse) {
    ShuffleVectorSDNode::commuteMask(Mask);
    Op0 = Op1;
    Op1 = SDValue();
  }

  return true;
}
5847 /// Returns the scalar element that will make up the ith
5848 /// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue();  // Limit search depth.
5854 SDValue V = SDValue(N, 0);
5855 EVT VT = V.getValueType();
5856 unsigned Opcode = V.getOpcode();
5858 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5859 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    int Elt = SV->getMaskElt(Index);
    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());
5865 unsigned NumElems = VT.getVectorNumElements();
5866 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5867 : SV->getOperand(1);
5868 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5871 // Recurse into target specific vector shuffles to find scalars.
5872 if (isTargetShuffle(Opcode)) {
5873 MVT ShufVT = V.getSimpleValueType();
5874 MVT ShufSVT = ShufVT.getVectorElementType();
5875 int NumElems = (int)ShufVT.getVectorNumElements();
5876 SmallVector<int, 16> ShuffleMask;
5877 SmallVector<SDValue, 16> ShuffleOps;
    bool IsUnary;
    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
      return SDValue();
5883 int Elt = ShuffleMask[Index];
5884 if (Elt == SM_SentinelZero)
5885 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
5886 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
5887 if (Elt == SM_SentinelUndef)
5888 return DAG.getUNDEF(ShufSVT);
5890 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
5891 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
                               Depth+1);
  }
5896 // Actual nodes that may contain scalar elements
5897 if (Opcode == ISD::BITCAST) {
5898 V = V.getOperand(0);
5899 EVT SrcVT = V.getValueType();
5900 unsigned NumElems = VT.getVectorNumElements();
    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
    return getShuffleScalarElt(V.getNode(), Index, DAG, Depth+1);
  }
5906 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5907 return (Index == 0) ? V.getOperand(0)
5908 : DAG.getUNDEF(VT.getVectorElementType());
  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}
5916 /// Custom lower build_vector of v16i8.
5917 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5918 unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     const TargetLowering &TLI) {
  if (NumNonZero > 8)
    return SDValue();

  SDLoc dl(Op);
  SDValue V;
  bool First = true;

  // SSE4.1 - use PINSRB to insert each byte directly.
  if (Subtarget.hasSSE41()) {
    for (unsigned i = 0; i < 16; ++i) {
      bool isNonZero = (NonZeros & (1 << i)) != 0;
      if (isNonZero) {
        if (First) {
          if (NumZero)
            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
          else
            V = DAG.getUNDEF(MVT::v16i8);
          First = false;
        }
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                        MVT::v16i8, V, Op.getOperand(i),
                        DAG.getIntPtrConstant(i, dl));
      }
    }
    return V;
  }
5950 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
      SDValue ThisElt, LastElt;
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
                              MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
                              ThisElt, DAG.getConstant(8, dl, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt.getNode())
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                        DAG.getIntPtrConstant(i/2, dl));
    }
  }

  return DAG.getBitcast(MVT::v16i8, V);
}
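// Worked example for the pre-SSE4.1 path above (illustrative): bytes b0
// (element 0) and b1 (element 1) are merged as (zext(b1) << 8) | zext(b0) and
// inserted as 16-bit word 0, matching the little-endian layout of a v16i8
// viewed as v8i16.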
5986 /// Custom lower build_vector of v8i16.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     const TargetLowering &TLI) {
  if (NumNonZero > 4)
    return SDValue();

  SDLoc dl(Op);
  SDValue V;
  bool First = true;
  for (unsigned i = 0; i < 8; ++i) {
    bool isNonZero = (NonZeros & (1 << i)) != 0;
    if (isNonZero) {
      if (First) {
        if (NumZero)
          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
        else
          V = DAG.getUNDEF(MVT::v8i16);
        First = false;
      }
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                      MVT::v8i16, V, Op.getOperand(i),
                      DAG.getIntPtrConstant(i, dl));
    }
  }

  return V;
}
6017 /// Custom lower build_vector of v4i32 or v4f32.
6018 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6019 const X86Subtarget &Subtarget,
6020 const TargetLowering &TLI) {
6021 // Find all zeroable elements.
6022 std::bitset<4> Zeroable;
6023 for (int i=0; i < 4; ++i) {
6024 SDValue Elt = Op->getOperand(i);
6025 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6027 assert(Zeroable.size() - Zeroable.count() > 1 &&
6028 "We expect at least two non-zero elements!");
6030 // We only know how to deal with build_vector nodes where elements are either
6031 // zeroable or extract_vector_elt with constant index.
6032 SDValue FirstNonZero;
6033 unsigned FirstNonZeroIdx;
  for (unsigned i = 0; i < 4; ++i) {
    if (Zeroable[i])
      continue;

    SDValue Elt = Op->getOperand(i);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Elt.getOperand(1)))
      return SDValue();
6041 // Make sure that this node is extracting from a 128-bit vector.
6042 MVT VT = Elt.getOperand(0).getSimpleValueType();
    if (!VT.is128BitVector())
      return SDValue();

    if (!FirstNonZero.getNode()) {
      FirstNonZero = Elt;
      FirstNonZeroIdx = i;
    }
  }
6051 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6052 SDValue V1 = FirstNonZero.getOperand(0);
6053 MVT VT = V1.getSimpleValueType();
  // See if this build_vector can be lowered as a blend with zero.
  SDValue Elt;
  unsigned EltMaskIdx, EltIdx;
  int Mask[4];
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      // The zero vector will be on the right hand side.
      Mask[EltIdx] = EltIdx+4;
      continue;
    }
6066 Elt = Op->getOperand(EltIdx);
6067 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6068 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
    Mask[EltIdx] = EltIdx;
  }

  if (EltIdx == 4) {
    // Let the shuffle legalizer deal with blend operations.
    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
    if (V1.getSimpleValueType() != VT)
      V1 = DAG.getBitcast(VT, V1);
    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
  }
6082 // See if we can lower this build_vector to a INSERTPS.
  if (!Subtarget.hasSSE41())
    return SDValue();
6086 SDValue V2 = Elt.getOperand(0);
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    V1 = SDValue();
6090 bool CanFold = true;
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    if (Zeroable[i])
      continue;

    SDValue Current = Op->getOperand(i);
    SDValue SrcVector = Current->getOperand(0);
    if (!V1.getNode())
      V1 = SrcVector;
    CanFold = SrcVector == V1 &&
      cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
  }

  if (!CanFold)
    return SDValue();
6106 assert(V1.getNode() && "Expected at least two non-zero elements!");
6107 if (V1.getSimpleValueType() != MVT::v4f32)
6108 V1 = DAG.getBitcast(MVT::v4f32, V1);
6109 if (V2.getSimpleValueType() != MVT::v4f32)
6110 V2 = DAG.getBitcast(MVT::v4f32, V2);
6112 // Ok, we can emit an INSERTPS instruction.
6113 unsigned ZMask = Zeroable.to_ulong();
6115 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6116 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  SDLoc DL(Op);
  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                               DAG.getIntPtrConstant(InsertPSMask, DL));
  return DAG.getBitcast(VT, Result);
}
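// INSERTPS immediate layout used above (for reference): bits [7:6] select the
// source element of V2 (EltMaskIdx), bits [5:4] the destination lane (EltIdx),
// and bits [3:0] are the zero mask. E.g. EltMaskIdx = 2, EltIdx = 1 and a
// zeroable lane 3 give 0b10011000 = 0x98 (values here are illustrative).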
6123 /// Return a vector logical shift node.
6124 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
                         SelectionDAG &DAG, const TargetLowering &TLI,
                         const SDLoc &dl) {
6127 assert(VT.is128BitVector() && "Unknown type for VShift");
6128 MVT ShVT = MVT::v16i8;
6129 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6130 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6131 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6132 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6133 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6134 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
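// For example (hypothetical operand X): getVShift(/*isLeft=*/true,
// MVT::v2i64, X, /*NumBits=*/64, ...) bitcasts X to v16i8 and emits a VSHLDQ
// by 8 bytes (a PSLLDQ), which moves the low i64 element into the high half
// and zero-fills the low half.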
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
                                      SelectionDAG &DAG) {

  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    // FIXME: 256-bit vector instructions don't require a strict alignment;
    // improve this code to support it better.
    unsigned RequiredAlign = VT.getSizeInBits()/8;
    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16 or 32.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
      if (MFI.isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead,
        // if someone *really* cares about this; that's the way to implement
        // it.
        return SDValue();
      } else {
        MFI.setObjectAlignment(FI, RequiredAlign);
      }
    }

    // (Offset % 16 or 32) must be a multiple of 4. The address to load from
    // is then Ptr + (Offset & ~15) or Ptr + (Offset & ~31).
    if ((Offset % RequiredAlign) & 3)
      return SDValue();
    int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
    if (StartOffset) {
      SDLoc DL(Ptr);
      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
    }

    int EltNo = (Offset - StartOffset) >> 2;
    unsigned NumElems = VT.getVectorNumElements();

    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                             LD->getPointerInfo().getWithOffset(StartOffset));

    SmallVector<int, 8> Mask(NumElems, EltNo);

    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
  }

  return SDValue();
}
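// For example (hypothetical stack object): splatting an f32 loaded from
// offset 12 of a 16-byte-aligned stack slot into a v4f32 becomes a v4f32
// load of the whole slot followed by the shuffle <3,3,3,3>, since
// StartOffset = 12 & ~15 = 0 and EltNo = (12 - 0) >> 2 = 3.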
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                        SDLoc &DL, SelectionDAG &DAG,
                                        bool isAfterLegalize) {
  unsigned NumElems = Elts.size();

  int LastLoadedElt = -1;
  SmallBitVector LoadMask(NumElems, false);
  SmallBitVector ZeroMask(NumElems, false);
  SmallBitVector UndefMask(NumElems, false);

  // For each element in the initializer, see if we've found a load, zero or
  // an undef.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = peekThroughBitcasts(Elts[i]);
    if (!Elt.getNode())
      return SDValue();

    if (Elt.isUndef())
      UndefMask[i] = true;
    else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
      ZeroMask[i] = true;
    else if (ISD::isNON_EXTLoad(Elt.getNode())) {
      LoadMask[i] = true;
      LastLoadedElt = i;
      // Each loaded element must be the correct fractional portion of the
      // requested vector load.
      if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
        return SDValue();
    } else
      return SDValue();
  }
  assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
         "Incomplete element masks");

  // Handle Special Cases - all undef or undef/zero.
  if (UndefMask.count() == NumElems)
    return DAG.getUNDEF(VT);

  // FIXME: Should we return this as a BUILD_VECTOR instead?
  if ((ZeroMask | UndefMask).count() == NumElems)
    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
                          : DAG.getConstantFP(0.0, DL, VT);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  int FirstLoadedElt = LoadMask.find_first();
  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
  LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
  EVT LDBaseVT = EltBase.getValueType();

  // Consecutive loads can contain UNDEF but not ZERO elements.
  // Consecutive loads with UNDEF and ZERO elements require an additional
  // shuffle stage to clear the ZERO elements.
  bool IsConsecutiveLoad = true;
  bool IsConsecutiveLoadWithZeros = true;
  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
    if (LoadMask[i]) {
      SDValue Elt = peekThroughBitcasts(Elts[i]);
      LoadSDNode *LD = cast<LoadSDNode>(Elt);
      if (!DAG.areNonVolatileConsecutiveLoads(
              LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
              i - FirstLoadedElt)) {
        IsConsecutiveLoad = false;
        IsConsecutiveLoadWithZeros = false;
        break;
      }
    } else if (ZeroMask[i]) {
      IsConsecutiveLoad = false;
    }
  }

  auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
    auto MMOFlags = LDBase->getMemOperand()->getFlags();
    assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
           "Cannot merge volatile loads.");
    SDValue NewLd =
        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                    LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);

    if (LDBase->hasAnyUseOfValue(1)) {
      SDValue NewChain =
          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
                      SDValue(NewLd.getNode(), 1));
      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
                             SDValue(NewLd.getNode(), 1));
    }

    return NewLd;
  };

  // LOAD - all consecutive load/undefs (must start/end with a load).
  // If we have found an entire vector of loads and undefs, then return a large
  // load of the entire vector width starting at the base pointer.
  // If the vector contains zeros, then attempt to shuffle those elements.
  if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
    assert(LDBase && "Did not find base load for merging consecutive loads");
    EVT EltVT = LDBase->getValueType(0);
    // Ensure that the input vector size for the merged loads matches the
    // cumulative size of the input elements.
    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
      return SDValue();

    if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
      return SDValue();

    if (IsConsecutiveLoad)
      return CreateLoad(VT, LDBase);

    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
    // vector and a zero vector to clear out the zero elements.
    if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
      SmallVector<int, 4> ClearMask(NumElems, -1);
      for (unsigned i = 0; i < NumElems; ++i) {
        if (ZeroMask[i])
          ClearMask[i] = i + NumElems;
        else if (LoadMask[i])
          ClearMask[i] = i;
      }
      SDValue V = CreateLoad(VT, LDBase);
      SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
                                 : DAG.getConstantFP(0.0, DL, VT);
      return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
    }
  }

  int LoadSize =
      (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();

  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
      (LoadSize == 32 || LoadSize == 64) &&
      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
                                      : MVT::getIntegerVT(LoadSize);
    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
    if (TLI.isTypeLegal(VecVT)) {
      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
      SDValue ResNode =
          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
                                  LDBase->getPointerInfo(),
                                  LDBase->getAlignment(),
                                  false/*isVolatile*/, true/*ReadMem*/,
                                  false/*WriteMem*/);

      // Make sure the newly-created LOAD is in the same position as LDBase in
      // terms of dependency. We create a TokenFactor for LDBase and ResNode,
      // and update uses of LDBase's output chain to use the TokenFactor.
      if (LDBase->hasAnyUseOfValue(1)) {
        SDValue NewChain =
            DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
                        SDValue(ResNode.getNode(), 1));
        DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
        DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
                               SDValue(ResNode.getNode(), 1));
      }

      return DAG.getBitcast(VT, ResNode);
    }
  }

  return SDValue();
}
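// Two illustrative cases for the logic above (addresses hypothetical): a
// v4i32 build_vector of four loads from consecutive addresses merges into a
// single 128-bit load, while <load a[0], load a[1], zero, zero> has
// LoadSize == 64 and becomes a 64-bit X86ISD::VZEXT_LOAD, i.e. a movq-style
// load that zero-fills the upper elements.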
static Constant *getConstantVector(MVT VT, APInt SplatValue,
                                   unsigned SplatBitSize, LLVMContext &C) {
  unsigned ScalarSize = VT.getScalarSizeInBits();
  unsigned NumElm = SplatBitSize / ScalarSize;

  SmallVector<Constant *, 32> ConstantVec;
  for (unsigned i = 0; i < NumElm; i++) {
    APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
    Constant *Const;
    if (VT.isFloatingPoint()) {
      assert((ScalarSize == 32 || ScalarSize == 64) &&
             "Unsupported floating point scalar size");
      if (ScalarSize == 32)
        Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
      else
        Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
    } else
      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
    ConstantVec.push_back(Const);
  }
  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
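// For example (hypothetical splat): VT = v8i32 with the 64-bit SplatValue
// 0x0000000100000002 produces the two-element constant vector <i32 2, i32 1>,
// since element i holds bits [32*i, 32*i + 32) of the splat value.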
static bool isUseOfShuffle(SDNode *N) {
  for (auto *U : N->uses()) {
    if (isTargetShuffle(U->getOpcode()))
      return true;
    if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
      return isUseOfShuffle(U);
  }
  return false;
}
/// Attempt to use the vbroadcast instruction to generate a splat value for the
/// following cases:
/// 1. A splat BUILD_VECTOR which uses:
///    a. A single scalar load, or a constant.
///    b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
///    a scalar load, or a constant.
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  // VBROADCAST requires AVX.
  // TODO: Splats could be generated for non-AVX CPUs using SSE
  // instructions, but there's less potential gain for only 128-bit vectors.
  if (!Subtarget.hasAVX())
    return SDValue();

  MVT VT = BVOp->getSimpleValueType(0);
  SDLoc dl(BVOp);

  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Unsupported vector type for broadcast.");

  BitVector UndefElements;
  SDValue Ld = BVOp->getSplatValue(&UndefElements);

  // We need a splat of a single value to use broadcast, and it doesn't
  // make any sense if the value is only in one element of the vector.
  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
    APInt SplatValue, Undef;
    unsigned SplatBitSize;
    bool HasUndef;
    // Check if this is a repeated constant pattern suitable for broadcasting.
    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
        SplatBitSize > VT.getScalarSizeInBits() &&
        SplatBitSize < VT.getSizeInBits()) {
      // Avoid replacing with broadcast when it's a use of a shuffle
      // instruction to preserve the present custom lowering of shuffles.
      if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
        return SDValue();
      // Replace BUILD_VECTOR with a broadcast of the repeated constants.
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      LLVMContext *Ctx = DAG.getContext();
      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
      if (Subtarget.hasAVX()) {
        if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
            !(SplatBitSize == 64 && Subtarget.is32Bit())) {
          // Splatted value can fit in one INTEGER constant in constant pool.
          // Load the constant and broadcast it.
          MVT CVT = MVT::getIntegerVT(SplatBitSize);
          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
          Ld = DAG.getLoad(
              CVT, dl, DAG.getEntryNode(), CP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
                                       MVT::getVectorVT(CVT, Repeat), Ld);
          return DAG.getBitcast(VT, Brdcst);
        } else if (SplatBitSize == 32 || SplatBitSize == 64) {
          // Splatted value can fit in one FLOAT constant in constant pool.
          // Load the constant and broadcast it.
          // AVX has support for 32 and 64 bit broadcasts for floats only.
          // No 64-bit integer in a 32-bit subtarget.
          MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
          Constant *C = SplatBitSize == 32
                            ? ConstantFP::get(Type::getFloatTy(*Ctx),
                                              SplatValue.bitsToFloat())
                            : ConstantFP::get(Type::getDoubleTy(*Ctx),
                                              SplatValue.bitsToDouble());
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
          Ld = DAG.getLoad(
              CVT, dl, DAG.getEntryNode(), CP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
                                       MVT::getVectorVT(CVT, Repeat), Ld);
          return DAG.getBitcast(VT, Brdcst);
        } else if (SplatBitSize > 64) {
          // Load the vector of constants and broadcast it.
          MVT CVT = VT.getScalarType();
          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
                                             *Ctx);
          SDValue VCP = DAG.getConstantPool(VecC, PVT);
          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
          unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
          Ld = DAG.getLoad(
              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
          return DAG.getBitcast(VT, Brdcst);
        }
      }
    }
    return SDValue();
  }

  bool ConstSplatVal =
      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);

  // Make sure that all of the users of a non-constant load are from the
  // BUILD_VECTOR node.
  if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
    return SDValue();

  unsigned ScalarSize = Ld.getValueSizeInBits();
  bool IsGE256 = (VT.getSizeInBits() >= 256);

  // When optimizing for size, generate up to 5 extra bytes for a broadcast
  // instruction to save 8 or more bytes of constant pool data.
  // TODO: If multiple splats are generated to load the same constant,
  // it may be detrimental to overall size. There needs to be a way to detect
  // that condition to know if this is truly a size win.
  bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
  // On Sandybridge (no AVX2), it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  // But override that restriction when optimizing for size.
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
      const Constant *C = nullptr;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP =
          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
      Ld = DAG.getLoad(
          CVT, dl, DAG.getEntryNode(), CP,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
          Alignment);

      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
    }
  }

  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget.hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget.hasVLX() && ScalarSize == 64))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The integer check is needed for the 64-bit into 128-bit case, so that it
  // doesn't match f64, since there is no vbroadcastsd xmm.
  if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  }

  // Unsupported broadcast.
  return SDValue();
}
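// For example (hypothetical constants): on AVX2 the v8i32 build_vector
// <1,2,1,2,1,2,1,2> is a repeated 64-bit pattern, so it is lowered as a
// VBROADCAST of the i64 constant 0x0000000200000001 from the constant pool
// (a vpbroadcastq), then bitcast back to v8i32.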
/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
                                         SDValue ExtIdx) {
  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
    return Idx;

  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
  // lowered this:
  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
  // to:
  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
  //                            (extract_subvector (v8f32 %vreg0), Constant<4>),
  //                            undef)
  //                       Constant<2>)
  // In this case the vector is the extract_subvector expression and the index
  // is 2, as specified by the shuffle.
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
  SDValue ShuffleVec = SVOp->getOperand(0);
  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
  assert(ShuffleVecVT.getVectorElementType() ==
         ExtractedFromVec.getSimpleValueType().getVectorElementType());

  int ShuffleIdx = SVOp->getMaskElt(Idx);
  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
    ExtractedFromVec = ShuffleVec;
    return ShuffleIdx;
  }
  return Idx;
}
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;
  SDValue VecIn2;
  SmallVector<unsigned, 4> InsertIndices;
  SmallVector<int, 8> Mask(NumElems, -1);

  for (unsigned i = 0; i != NumElems; ++i) {
    unsigned Opc = Op.getOperand(i).getOpcode();

    if (Opc == ISD::UNDEF)
      continue;

    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than 1 element needs inserting.
      if (InsertIndices.size() > 1)
        return SDValue();

      InsertIndices.push_back(i);
      continue;
    }

    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
    // Quit if non-constant index.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

    // Quit if extracted from vector of different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    if (!VecIn1.getNode())
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
      if (!VecIn2.getNode())
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors to shuffle
        return SDValue();
    }

    if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  }

  if (!VecIn1.getNode())
    return SDValue();

  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
    unsigned Idx = InsertIndices[i];
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                     DAG.getIntPtrConstant(Idx, DL));
  }

  return NV;
}
static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
         Op.getScalarValueSizeInBits() == 1 &&
         "Can not convert non-constant vector");
  uint64_t Immediate = 0;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (!In.isUndef())
      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
  }
  SDLoc dl(Op);
  MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
  return DAG.getConstant(Immediate, dl, VT);
}
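// For example (hypothetical constants): the v4i1 build_vector <1,0,1,1>
// packs into the immediate 0b1101 (bit idx is set iff element idx is 1),
// returned as an i8 constant since the result is widened to at least 8 bits.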
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
SDValue
X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();
  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);
  if (ISD::isBuildVectorAllZeros(Op.getNode()))
    return DAG.getTargetConstant(0, dl, VT);

  if (ISD::isBuildVectorAllOnes(Op.getNode()))
    return DAG.getTargetConstant(1, dl, VT);

  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
    if (Imm.getValueSizeInBits() == VT.getSizeInBits())
      return DAG.getBitcast(VT, Imm);
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                       DAG.getIntPtrConstant(0, dl));
  }

  // Vector has one or more non-const elements.
  uint64_t Immediate = 0;
  SmallVector<unsigned, 16> NonConstIdx;
  bool IsSplat = true;
  bool HasConstElts = false;
  int SplatIdx = -1;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (In.isUndef())
      continue;
    if (!isa<ConstantSDNode>(In))
      NonConstIdx.push_back(idx);
    else {
      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
      HasConstElts = true;
    }
    if (SplatIdx < 0)
      SplatIdx = idx;
    else if (In != Op.getOperand(SplatIdx))
      IsSplat = false;
  }

  // For splats, use "(select i1 splat_elt, all-ones, all-zeroes)".
  if (IsSplat)
    return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
                       DAG.getConstant(1, dl, VT),
                       DAG.getConstant(0, dl, VT));

  // Insert the non-constant elements one by one.
  SDValue DstVec;
  SDValue Imm;
  if (Immediate) {
    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
    Imm = DAG.getConstant(Immediate, dl, ImmVT);
  }
  else if (HasConstElts)
    Imm = DAG.getConstant(0, dl, VT);
  else
    Imm = DAG.getUNDEF(VT);
  if (Imm.getValueSizeInBits() == VT.getSizeInBits())
    DstVec = DAG.getBitcast(VT, Imm);
  else {
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
    DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                         DAG.getIntPtrConstant(0, dl));
  }

  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
    unsigned InsertIdx = NonConstIdx[i];
    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(InsertIdx),
                         DAG.getIntPtrConstant(InsertIdx, dl));
  }
  return DstVec;
}
/// \brief Return true if \p N implements a horizontal binop and return the
/// operands for the horizontal binop into V0 and V1.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
/// operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
                              SelectionDAG &DAG,
                              unsigned BaseIdx, unsigned LastIdx,
                              SDValue &V0, SDValue &V1) {
  EVT VT = N->getValueType(0);

  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
         "Invalid Vector in input!");

  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
  bool CanFold = true;
  unsigned ExpectedVExtractIdx = BaseIdx;
  unsigned NumElts = LastIdx - BaseIdx;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // Check if N implements a horizontal binop.
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
    SDValue Op = N->getOperand(i + BaseIdx);

    // Skip UNDEFs.
    if (Op->isUndef()) {
      // Update the expected vector extract index.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
      continue;
    }

    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

    if (!CanFold)
      break;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op0.getOperand(0) == Op1.getOperand(0) &&
               isa<ConstantSDNode>(Op0.getOperand(1)) &&
               isa<ConstantSDNode>(Op1.getOperand(1)));
    if (!CanFold)
      break;

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

    if (i * 2 < NumElts) {
      if (V0.isUndef()) {
        V0 = Op0.getOperand(0);
        if (V0.getValueType() != VT)
          return false;
      }
    } else {
      if (V1.isUndef()) {
        V1 = Op0.getOperand(0);
        if (V1.getValueType() != VT)
          return false;
      }
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    }

    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
    if (I0 == ExpectedVExtractIdx)
      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
      // Try to match the following dag sequence:
      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
    } else
      CanFold = false;

    ExpectedVExtractIdx += 2;
  }

  return CanFold;
}
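// For example (hypothetical vectors A and B): the v4f32 build_vector
//   <(fadd A[0], A[1]), (fadd A[2], A[3]), (fadd B[0], B[1]), (fadd B[2], B[3])>
// matches with V0 = A and V1 = B, which is exactly what HADDPS A, B computes.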
/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
/// the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node takes as input
/// the lower 128-bit of V0 and the upper 128-bit of V0, and the second
/// horizontal binop dag node takes as input the lower 128-bit of V1
/// and the upper 128-bit of V1.
///   Example:
///     HADD V0_LO, V0_HI
///     HADD V1_LO, V1_HI
///
/// Otherwise, the first horizontal binop dag node takes as input the lower
/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
///   Example:
///     HADD V0_LO, V1_LO
///     HADD V0_HI, V1_HI
///
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128-bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     unsigned X86Opcode, bool Mode,
                                     bool isUndefLO, bool isUndefHI) {
  MVT VT = V0.getSimpleValueType();
  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
         "Invalid nodes in input!");

  unsigned NumElts = VT.getVectorNumElements();
  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
  SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
  SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
  MVT NewVT = V0_LO.getSimpleValueType();

  SDValue LO = DAG.getUNDEF(NewVT);
  SDValue HI = DAG.getUNDEF(NewVT);

  if (Mode) {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && !V0->isUndef())
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
    if (!isUndefHI && !V1->isUndef())
      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  } else {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);

    if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
/// node.
static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
                             const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT VT = BV->getSimpleValueType(0);
  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
    return SDValue();

  SDLoc DL(BV);
  unsigned NumElts = VT.getVectorNumElements();
  SDValue InVec0 = DAG.getUNDEF(VT);
  SDValue InVec1 = DAG.getUNDEF(VT);

  assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v2f64) && "build_vector with an invalid type found!");

  // Odd-numbered elements in the input build vector are obtained from
  // adding two integer/float elements.
  // Even-numbered elements in the input build vector are obtained from
  // subtracting two integer/float elements.
  unsigned ExpectedOpcode = ISD::FSUB;
  unsigned NextExpectedOpcode = ISD::FADD;
  bool AddFound = false;
  bool SubFound = false;

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Op = BV->getOperand(i);

    // Skip 'undef' values.
    unsigned Opcode = Op.getOpcode();
    if (Opcode == ISD::UNDEF) {
      std::swap(ExpectedOpcode, NextExpectedOpcode);
      continue;
    }

    // Early exit if we found an unexpected opcode.
    if (Opcode != ExpectedOpcode)
      return SDValue();

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
    // Early exit if we cannot match that sequence.
    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
        Op0.getOperand(1) != Op1.getOperand(1))
      return SDValue();

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    if (I0 != i)
      return SDValue();

    // We found a valid add/sub node. Update the information accordingly.
    if (i & 1)
      AddFound = true;
    else
      SubFound = true;

    // Update InVec0 and InVec1.
    if (InVec0.isUndef()) {
      InVec0 = Op0.getOperand(0);
      if (InVec0.getSimpleValueType() != VT)
        return SDValue();
    }
    if (InVec1.isUndef()) {
      InVec1 = Op1.getOperand(0);
      if (InVec1.getSimpleValueType() != VT)
        return SDValue();
    }

    // Make sure that operands in input to each add/sub node always
    // come from a same pair of vectors.
    if (InVec0 != Op0.getOperand(0)) {
      if (ExpectedOpcode == ISD::FSUB)
        return SDValue();

      // FADD is commutable. Try to commute the operands
      // and then test again.
      std::swap(Op0, Op1);
      if (InVec0 != Op0.getOperand(0))
        return SDValue();
    }

    if (InVec1 != Op1.getOperand(0))
      return SDValue();

    // Update the pair of expected opcodes.
    std::swap(ExpectedOpcode, NextExpectedOpcode);
  }

  // Don't try to fold this build_vector into an ADDSUB if the inputs are
  // undef.
  if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
    return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);

  return SDValue();
}
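// For example (hypothetical vectors a and b): the v4f32 build_vector
//   <(fsub a[0], b[0]), (fadd a[1], b[1]), (fsub a[2], b[2]), (fadd a[3], b[3])>
// becomes (X86ISD::ADDSUB a, b), i.e. a single ADDSUBPS, which subtracts in
// the even lanes and adds in the odd lanes.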
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = BV->getSimpleValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  unsigned Half = NumElts/2;

  // Count the number of UNDEF operands in the build_vector in input.
  for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsLO++;

  for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsHI++;

  // Early exit if this is either a build_vector of all UNDEFs or all the
  // operands but one are UNDEF.
  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
    return SDValue();

  SDLoc DL(BV);
  SDValue InVec0, InVec1;
  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
    // Try to match an SSE3 float HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
    // Try to match an SSSE3 integer HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
  }

  if (!Subtarget.hasAVX())
    return SDValue();

  if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
    // Try to match an AVX horizontal add/sub of packed single/double
    // precision floating point values from 256-bit vectors.
    SDValue InVec2, InVec3;
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    // Try to match an AVX2 horizontal add/sub of signed integers.
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
    bool CanFold = true;

    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Fold this build_vector into a single horizontal add/sub.
      // Do this only if the target has AVX2.
      if (Subtarget.hasAVX2())
        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binops followed
      // by a concat vector.
      bool isUndefLO = NumUndefsLO == Half;
      bool isUndefHI = NumUndefsHI == Half;
      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
                                   isUndefLO, isUndefHI);
    }
  }

  if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
       VT == MVT::v16i16) && Subtarget.hasAVX()) {
    unsigned X86Opcode;
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HSUB;
    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHADD;
    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
/// just apply the bit to the vectors.
/// NOTE: It's not in our interest to start making a general-purpose vectorizer
/// from this, but enough scalar bit operations are created by the later
/// legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op->getSimpleValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Check that all elements have the same opcode.
  // TODO: Should we allow UNDEFS and if so how many?
  unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned i = 1; i < NumElems; ++i)
    if (Opcode != Op->getOperand(i).getOpcode())
      return SDValue();

  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
  switch (Opcode) {
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    break;
  default:
    return SDValue();
  }

  if (!TLI.isOperationLegalOrPromote(Opcode, VT))
    return SDValue();

  SmallVector<SDValue, 4> LHSElts, RHSElts;
  for (SDValue Elt : Op->ops()) {
    SDValue LHS = Elt.getOperand(0);
    SDValue RHS = Elt.getOperand(1);

    // We expect the canonicalized RHS operand to be the constant.
    if (!isa<ConstantSDNode>(RHS))
      return SDValue();
    LHSElts.push_back(LHS);
    RHSElts.push_back(RHS);
  }

  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
}
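// For example (hypothetical scalars): the v4i32 build_vector
//   <(xor x0, 1), (xor x1, 2), (xor x2, 4), (xor x3, 8)>
// becomes (xor (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8)),
// turning four scalar bit operations into one vector operation.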
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();

  // Vectors containing all zeros can be matched by pxor and xorps.
  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
      return Op;

    return getZeroVector(VT, Subtarget, DAG, DL);
  }

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors.
  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
    if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
        (VT == MVT::v8i32 && Subtarget.hasInt256()))
      return Op;

    return getOnesVector(VT, Subtarget, DAG, DL);
  }

  return SDValue();
}
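// For example, an all-ones v4i32 build_vector needs no constant-pool load:
// "pcmpeqd %xmm0, %xmm0" sets every bit of the register, because any value
// compares equal to itself.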
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);

  MVT VT = Op.getSimpleValueType();
  MVT ExtVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Generate vectors for predicate vectors.
  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
    return LowerBUILD_VECTORvXi1(Op, DAG);

  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
    return VectorConstant;

  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
  if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
    return AddSub;
  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
    return HorizontalOp;
  if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG))
    return Broadcast;
  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
    return BitOp;

  unsigned EVTBits = ExtVT.getSizeInBits();

  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.isUndef())
      continue;
    Values.insert(Elt);
    if (Elt.getOpcode() != ISD::Constant &&
        Elt.getOpcode() != ISD::ConstantFP)
      IsAllConstants = false;
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
      NonZeros |= ((uint64_t)1 << i);
      NumNonZero++;
    }
  }

  // All undef vector. Return an UNDEF. All zero vectors were handled above.
  if (NumNonZero == 0)
    return DAG.getUNDEF(VT);

  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = countTrailingZeros(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits
    // of the value are obviously zero, truncate the value to i32 and do the
    // insertion that way. Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0. It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
    if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
        // Handle SSE only.
        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
        MVT VecVT = MVT::v4i32;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
                                      Item, Idx * 2, true, Subtarget, DAG));
      }
    }

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
          (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
        assert((VT.is128BitVector() || VT.is256BitVector() ||
                VT.is512BitVector()) &&
               "Expected an SSE value type!");
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      }

      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        if (VT.getSizeInBits() >= 256) {
          MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
          if (Subtarget.hasAVX()) {
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
          } else {
            // Without AVX, we need to extend to a 128-bit vector and then
            // insert into the 256-bit vector.
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
            SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
            Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
          }
        } else {
          assert(VT.is128BitVector() && "Expected an SSE value type!");
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
        }
        return DAG.getBitcast(VT, Item);
      }
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the
    // element is a non-constant being inserted into an element other than the
    // low one, we can't use a constant pool load. Instead, use
    // SCALAR_TO_VECTOR (aka movd/movss) to move this into the low element,
    // then shuffle it into place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget,
                                         DAG);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // check if it's possible to issue this instead:
      // shuffle (vload ptr), undef, <1, 1, 1, 1>
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // See if we can use a vector load to get all of the elements.
  if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
    if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
      return LD;
  }

  // For AVX-length vectors, build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);

    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);

    // Build both the lower and upper subvector.
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));

    // Recreate the wider vector with the lower and upper part.
    if (VT.is256BitVector())
      return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
    return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
  }

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
    }
    return SDValue();
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16)
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget, *this))
      return V;

  if (EVTBits == 16 && NumElems == 8)
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget, *this))
      return V;

  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS.
  if (EVTBits == 32 && NumElems == 4)
    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
      return V;

  // If element VT is == 32 bits, turn it into a number of shuffles.
  if (NumElems == 4 && NumZero > 0) {
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1ULL << i));
      if (isZero)
        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          Ops[i] = Ops[i*2];  // Must be a zero vector.
          break;
        case 1:
          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
          break;
        case 2:
          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
          break;
        case 3:
          Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
          break;
      }
    }

    bool Reverse1 = (NonZeros & 0x3) == 2;
    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
  }

  if (Values.size() > 1 && VT.is128BitVector()) {
    // Check for a build vector from mostly shuffle plus few inserting.
    if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
      return Sh;

    // For SSE 4.1, use insertps to put the high elements into the low element.
    if (Subtarget.hasSSE41()) {
      SDValue Result;
      if (!Op.getOperand(0).isUndef())
        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
      else
        Result = DAG.getUNDEF(VT);

      for (unsigned i = 1; i < NumElems; ++i) {
        if (Op.getOperand(i).isUndef()) continue;
        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                             Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
      }
      return Result;
    }

    // Otherwise, expand into a number of unpckl*, start by extending each of
    // our (non-undef) elements to the full vector width with the element in
    // the bottom slot of the vector (which generates no code for SSE).
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < NumElems; ++i) {
      if (!Op.getOperand(i).isUndef())
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
      else
        Ops[i] = DAG.getUNDEF(VT);
    }

    // Next, we iteratively mix elements, e.g. for v4f32:
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    unsigned EltStride = NumElems >> 1;
    while (EltStride != 0) {
      for (unsigned i = 0; i < EltStride; ++i) {
        // If Ops[i+EltStride] is undef and this is the first round of mixing,
        // then it is safe to just drop this shuffle: V[i] is already in the
        // right place, the one element (since it's the first round) being
        // inserted as undef can be dropped. This isn't safe for successive
        // rounds because they will permute elements within both vectors.
        if (Ops[i+EltStride].isUndef() &&
            EltStride == NumElems/2)
          continue;

        Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
      }
      EltStride >>= 1;
    }
    return Ops[0];
  }
  return SDValue();
}
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  if (ResVT.is256BitVector())
    return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

  if (Op.getNumOperands() == 4) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    SDValue V3 = Op.getOperand(2);
    SDValue V4 = Op.getOperand(3);
    return concat256BitVectors(
        concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
        concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
        NumElems, DAG, dl);
  }
  return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG & DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOfOperands = Op.getNumOperands();

  assert(isPowerOf2_32(NumOfOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  SDValue Undef = DAG.getUNDEF(ResVT);
  if (NumOfOperands > 2) {
    // Specialize the cases when all, or all but one, of the operands are
    // undef.
    unsigned NumOfDefinedOps = 0;
    unsigned OpIdx = 0;
    for (unsigned i = 0; i < NumOfOperands; i++)
      if (!Op.getOperand(i).isUndef()) {
        NumOfDefinedOps++;
        OpIdx = i;
      }
    if (NumOfDefinedOps == 0)
      return Undef;
    if (NumOfDefinedOps == 1) {
      unsigned SubVecNumElts =
          Op.getOperand(OpIdx).getValueType().getVectorNumElements();
      SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
                         Op.getOperand(OpIdx), IdxVal);
    }

    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    SmallVector<SDValue, 2> Ops;
    for (unsigned i = 0; i < NumOfOperands/2; i++)
      Ops.push_back(Op.getOperand(i));
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    Ops.clear();
    for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
      Ops.push_back(Op.getOperand(i));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  assert(V1.getValueType() == V2.getValueType() &&
         V1.getValueType().getVectorNumElements() == NumElems/2 &&
         "Unexpected operands in CONCAT_VECTORS");

  if (ResVT.getSizeInBits() >= 16)
    return Op; // The operation is legal with KUNPCK

  bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
  SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
  if (IsZeroV1 && IsZeroV2)
    return ZeroVec;

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  if (V2.isUndef())
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  if (IsZeroV2)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);

  SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
  if (V1.isUndef())
    V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);

  if (IsZeroV1)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);

  V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getVectorElementType() == MVT::i1)
    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
          Op.getNumOperands() == 4)));

  // AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.

  // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
  return LowerAVXCONCAT_VECTORS(Op, DAG);
}
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
//===----------------------------------------------------------------------===//

/// \brief Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  }
  return true;
}
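// For example (hypothetical masks): for four elements, <0, -1, 2, 3> is a
// no-op (only undef or in-place entries), while <0, 0, 2, 3> is not, because
// element 1 would have to be replaced by element 0.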
/// \brief Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}
/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}

/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}

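// For example, the v8f32 mask {0, 8, 2, 10, 4, 12, 6, 14} is 128-bit lane
// repeating: both lanes perform the lane-relative shuffle {0, 4, 2, 6}, with
// the second-vector entries remapped into [4, 8).
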
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM =
        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}

/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  int Size = Mask.size();

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (!MaskBV || !ExpectedBV ||
          MaskBV->getOperand(Mask[i] % Size) !=
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        return false;
    }
  }

  return true;
}

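// For example, isShuffleEquivalent(V1, V2, {0, -1, 2, 3}, {0, 1, 2, 3})
// returns true, since an undef mask element matches anything. When V1 and V2
// are build vectors it also looks through the operands, so two different
// indices that reference identical scalar operands still compare equal.
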
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;

  for (int i = 0; i < Size; ++i)
    if (Mask[i] == SM_SentinelUndef)
      continue;
    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
      return false;
    else if (Mask[i] != ExpectedMask[i])
      return false;

  return true;
}

/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

  unsigned Imm = 0;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  return Imm;
}

static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
                                          SelectionDAG &DAG) {
  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}

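// Worked example: the mask {2, 3, 0, 1} (swap the two halves of a 4-lane
// vector) encodes as 2 | (3 << 2) | (0 << 4) | (1 << 6) == 0x4E, the familiar
// immediate in `pshufd $0x4e, %xmm0, %xmm0`.
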
/// \brief Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
                                                     SDValue V1, SDValue V2) {
  SmallBitVector Zeroable(Mask.size(), false);
  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      Zeroable[i] = true;
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef() || X86::isZeroNode(Op))
        Zeroable[i] = true;
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        APInt Val = Cst->getAPIntValue();
        Val = Val.lshr((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        Zeroable[i] = (Val == 0);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val = Val.lshr((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        Zeroable[i] = (Val == 0);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllZeroable = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
      }
      Zeroable[i] = AllZeroable;
    }
  }

  return Zeroable;
}

/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2,
                                            const SmallBitVector &Zeroable,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  SDValue V;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();

    M = M % LaneSize;
    M = M * NumEltBytes + (i % NumEltBytes);
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}

// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
                                           ArrayRef<int> Mask, SDValue V1,
                                           SDValue V2, SelectionDAG &DAG) {
  SmallVector<int, 8> Unpckl;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

  SmallVector<int, 8> Unpckh;
  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

  // Commute and try again.
  ShuffleVectorSDNode::commuteMask(Unpckl);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

  ShuffleVectorSDNode::commuteMask(Unpckh);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  return SDValue();
}

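// For reference, the binary v4i32 unpack masks matched here are {0, 4, 1, 5}
// for UNPCKL (interleaving the low halves of V1 and V2) and {2, 6, 3, 7} for
// UNPCKH; the commuted checks catch the same patterns with V1 and V2 swapped.
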
/// \brief Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const SmallBitVector &Zeroable,
                                           SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes =
      DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.

    VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
}

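// For example, a v4i32 shuffle whose mask is {0, zz, 2, zz} (zz == zeroable)
// becomes a single AND with the build-vector mask {-1, 0, -1, 0}, keeping
// elements 0 and 2 of V1 and zeroing the other two.
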
/// \brief Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            SelectionDAG &DAG) {
  assert(VT.isInteger() && "Only supports integer vector types!");
  MVT EltVT = VT.getVectorElementType();
  int NumEltBits = EltVT.getSizeInBits();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
                                    EltVT);
  SmallVector<SDValue, 16> MaskOps;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
      return SDValue(); // Shuffled input!
    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
  }

  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
  // We have to cast V2 around.
  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
                                      DAG.getBitcast(MaskVT, V1Mask),
                                      DAG.getBitcast(MaskVT, V2)));
  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}

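// For example, blending v4i32 inputs with mask {0, 5, 2, 7} builds
// V1Mask == {-1, 0, -1, 0} and computes (V1 & V1Mask) | (V2 & ~V1Mask),
// i.e. a PAND/PANDN/POR sequence when no real blend instruction is available.
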
/// \brief Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Original,
                                         const SmallBitVector &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
  SmallVector<int, 8> Mask(Original.begin(), Original.end());
  bool ForceV1Zero = false, ForceV2Zero = false;

  // Attempt to generate the binary blend mask. If an input is zero then
  // we can use any lane.
  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
  unsigned BlendMask = 0;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if (M == i)
      continue;
    if (M == i + Size) {
      BlendMask |= 1u << i;
      continue;
    }
    if (Zeroable[i]) {
      if (V1IsZero) {
        ForceV1Zero = true;
        Mask[i] = i;
        continue;
      }
      if (V2IsZero) {
        ForceV2Zero = true;
        BlendMask |= 1u << i;
        Mask[i] = i + Size;
        continue;
      }
    }
    return SDValue(); // Shuffled input!
  }

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
    unsigned ScaledMask = 0;
    for (int i = 0; i != Size; ++i)
      if (BlendMask & (1u << i))
        for (int j = 0; j != Scale; ++j)
          ScaledMask |= 1u << (i * Scale + j);
    return ScaledMask;
  };

  switch (VT.SimpleTy) {
  case MVT::v2f64:
  case MVT::v4f32:
  case MVT::v4f64:
  case MVT::v8f32:
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       DAG.getConstant(BlendMask, DL, MVT::i8));

  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    LLVM_FALLTHROUGH;
  case MVT::v2i64:
  case MVT::v4i32:
    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
    // that instruction.
    if (Subtarget.hasAVX2()) {
      // Scale the blend by the number of 32-bit dwords per element.
      int Scale = VT.getScalarSizeInBits() / 32;
      BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
      V1 = DAG.getBitcast(BlendVT, V1);
      V2 = DAG.getBitcast(BlendVT, V2);
      return DAG.getBitcast(
          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
                          DAG.getConstant(BlendMask, DL, MVT::i8)));
    }
    LLVM_FALLTHROUGH;
  case MVT::v8i16: {
    // For integer shuffles we need to expand the mask and cast the inputs to
    // v8i16s prior to blending.
    int Scale = 8 / VT.getVectorNumElements();
    BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = DAG.getBitcast(MVT::v8i16, V2);
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
  }

  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      BlendMask = 0;
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 8)
          BlendMask |= 1u << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getConstant(BlendMask, DL, MVT::i8));
    }
    LLVM_FALLTHROUGH;
  }
  case MVT::v16i8:
  case MVT::v32i8: {
    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
           "256-bit byte-blends require AVX2 support!");

    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
    if (SDValue Masked =
            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
      return Masked;

    // Scale the blend by the number of bytes per element.
    int Scale = VT.getScalarSizeInBits() / 8;

    // This form of blend is always done on bytes. Compute the byte vector
    // type.
    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
    // mix of LLVM's code generator and the x86 backend. We tell the code
    // generator that boolean values in the elements of an x86 vector register
    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
    // mapping a select to operand #1, and 'false' mapping to operand #2. The
    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
    // of the element (the remaining are ignored) and 0 in that high bit would
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
    // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over constrained relative to x86's
    // actual model.
    SmallVector<SDValue, 32> VSELECTMask;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      for (int j = 0; j < Scale; ++j)
        VSELECTMask.push_back(
            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
                                          MVT::i8));

    V1 = DAG.getBitcast(BlendVT, V1);
    V2 = DAG.getBitcast(BlendVT, V2);
    return DAG.getBitcast(
        VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
                        DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
  }

  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}

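// For example, a v4f32 shuffle with mask {0, 5, 2, 7} is a pure blend: bits 1
// and 3 of the immediate select V2, so this lowers to BLENDPS with
// BlendMask == 0b1010 (0xA).
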
/// \brief Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
                                                   SDValue V1, SDValue V2,
                                                   ArrayRef<int> Mask,
                                                   SelectionDAG &DAG) {
  // We build up the blend mask while checking whether a blend is a viable way
  // to reduce the shuffle.
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");

    if (BlendMask[Mask[i] % Size] < 0)
      BlendMask[Mask[i] % Size] = Mask[i];
    else if (BlendMask[Mask[i] % Size] != Mask[i])
      return SDValue(); // Can't blend in the needed input!

    PermuteMask[i] = Mask[i] % Size;
  }

  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}

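// For example, the v4i32 mask {2, 5, 0, 7} is not itself a blend, but a blend
// with mask {0, 5, 2, 7} followed by the single-input permute {2, 1, 0, 3}
// produces it, which is exactly the decomposition built above.
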
/// \brief Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
                                                          MVT VT, SDValue V1,
                                                          SDValue V2,
                                                          ArrayRef<int> Mask,
                                                          SelectionDAG &DAG) {
  // Shuffle the input elements into the desired positions in V1 and V2 and
  // blend them together.
  SmallVector<int, 32> V1Mask(Mask.size(), -1);
  SmallVector<int, 32> V2Mask(Mask.size(), -1);
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] < Size) {
      V1Mask[i] = Mask[i];
      BlendMask[i] = i;
    } else if (Mask[i] >= Size) {
      V2Mask[i] = Mask[i] - Size;
      BlendMask[i] = i + Size;
    }

  // Try to lower with the simpler initial blend strategy unless one of the
  // input shuffles would be a no-op. We prefer to shuffle inputs as the
  // shuffle may be able to fold with a load or other benefit. However, when
  // we'll have to do 2x as many shuffles in order to achieve this, blending
  // first is a better strategy.
  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
    if (SDValue BlendPerm =
            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
      return BlendPerm;

  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}

/// \brief Try to lower a vector shuffle as a rotation.
///
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask) {
  int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  //   [11, 12, 13, 14, 15,  0,  1,  2]
  //   [-1, 12, 13, 14, -1, -1,  1, -1]
  //   [-1, -1, -1, -1, -1, -1,  1,  2]
  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
  //   [-1,  4,  5,  6, -1, -1,  9, -1]
  //   [-1,  4,  5,  6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);
    // The identity rotation isn't interesting, stop.
    if (StartIdx == 0)
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;

    // Compute which value this mask is pointing at.
    SDValue MaskV = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
    else if (TargetV != MaskV)
      // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;

  return Rotation;
}

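// For example, the v8i16 mask {3, 4, 5, 6, 7, 8, 9, 10} matches with
// Rotation == 3: elements 3..7 are the tail of the first input (assigned to
// Hi) and elements 8..10 are the head of the second input (assigned to Lo).
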
/// \brief Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
                                          ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
    return -1;

  // PALIGNR works on 128-bit lanes.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so we need to scale the
  // rotation based on how many bytes are in the vector lane.
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
}

static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  SDValue Lo = V1, Hi = V2;
  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;

  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                                DAG.getConstant(LoByteShift, DL, MVT::i8));
  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                                DAG.getConstant(HiByteShift, DL, MVT::i8));
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}

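// Continuing the example above, a rotation of 3 v8i16 elements scales to a
// byte rotation of 6, which on SSSE3 is a single `palignr $6` and on plain
// SSE2 becomes PSLLDQ $10 of Lo, PSRLDQ $6 of Hi, and a POR of the two.
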
/// \brief Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
                                          SDValue V1, SDValue V2,
                                          ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  SDValue Lo = V1, Hi = V2;
  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation <= 0)
    return SDValue();

  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
                     DAG.getConstant(Rotation, DL, MVT::i8));
}

/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz,  2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5, 6,  7, zz, zz, zz, zz, zz]
/// [ -1, 5,  6,  7, zz, zz, zz, zz]
/// [  1, 2, -1, -1, -1, -1, zz, zz]
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                                     unsigned ScalarSizeInBits,
                                     ArrayRef<int> Mask, int MaskOffset,
                                     const SmallBitVector &Zeroable,
                                     const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}

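// For example, the v8i16 mask {zz, 0, 1, 2, 3, 4, 5, 6} matches at
// Scale == 8 as a whole-lane left byte shift: Opcode == X86ISD::VSHLDQ with
// a 2-byte amount, i.e. pslldq $2.
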
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const SmallBitVector &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  SDValue V = V1;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchVectorShuffleAsShift(
      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt =
        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                  Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}

/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const SmallBitVector &Zeroable,
                                           SelectionDAG &DAG) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.all() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return SDValue();

  // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
  // Remainder of lower half result is zero and upper half is all undef.
  auto LowerAsEXTRQ = [&]() {
    // Determine the extraction length from the part of the
    // lower half that isn't zeroable.
    int Len = HalfSize;
    for (; Len > 0; --Len)
      if (!Zeroable[Len - 1])
        break;
    assert(Len > 0 && "Zeroable shuffle mask");

    // Attempt to match first Len sequential elements from the lower half.
    SDValue Src;
    int Idx = -1;
    for (int i = 0; i != Len; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue;
      SDValue &V = (M < Size ? V1 : V2);
      M = M % Size;

      // The extracted elements must start at a valid index and all mask
      // elements must be in the lower half.
      if (i > M || M >= HalfSize)
        return SDValue();

      if (Idx < 0 || (Src == V && Idx == (M - i))) {
        Src = V;
        Idx = M - i;
        continue;
      }
      return SDValue();
    }

    if (Idx < 0)
      return SDValue();

    assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
    int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
    int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));
  };

  if (SDValue ExtrQ = LowerAsEXTRQ())
    return ExtrQ;

  // INSERTQ: Extract lowest Len elements from lower half of second source and
  // insert over first source, starting at Idx.
  // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
  auto LowerAsInsertQ = [&]() {
    for (int Idx = 0; Idx != HalfSize; ++Idx) {
      SDValue Base;

      // Attempt to match first source from mask before insertion point.
      if (isUndefInRange(Mask, 0, Idx)) {
        /* EMPTY */
      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
        Base = V1;
      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
        Base = V2;
      } else {
        continue;
      }

      // Extend the extraction length looking to match both the insertion of
      // the second source and the remaining elements of the first.
      for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
        SDValue Insert;
        int Len = Hi - Idx;

        if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
          Insert = V1;
        } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
          Insert = V2;
        } else {
          continue;
        }

        // Match the remaining elements of the lower half.
        if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
          /* EMPTY */
        } else if ((!Base || (Base == V1)) &&
                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
          Base = V1;
        } else if ((!Base || (Base == V2)) &&
                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                              Size + Hi)) {
          Base = V2;
        } else {
          continue;
        }

        // We may not have a base (first source) - this can safely be undefined.
        if (!Base)
          Base = DAG.getUNDEF(VT);

        int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
        int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
        return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
                           DAG.getConstant(BitLen, DL, MVT::i8),
                           DAG.getConstant(BitIdx, DL, MVT::i8));
      }
    }

    return SDValue();
  };

  if (SDValue InsertQ = LowerAsInsertQ())
    return InsertQ;

  return SDValue();
}

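// For example, the v8i16 mask {1, 2, zz, zz, -1, -1, -1, -1} (upper half
// undef, elements 2 and 3 zeroable) matches EXTRQ with Len == 2 and
// Idx == 1, emitted as EXTRQI with BitLen == 32 and BitIdx == 16.
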
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// can start from an offset element index in the input; to avoid excess
/// shuffling the offset must either be in the bottom lane or at the start
/// of a higher lane. All extended elements must be from the same lane.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  };

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    if (!Offset)
      return V;

    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    }
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
  };

  // Found a valid zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = ShuffleOffset(InputV);

    // For 256-bit vectors, we only need the lower (128-bit) input half.
    // For 512-bit vectors, we only need the lower input half or quarter.
    if (VT.getSizeInBits() > 128)
      InputV = extractSubVector(InputV, 0, DAG, DL,
                                std::max(128, (int)VT.getSizeInBits() / Scale));

    InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
    return DAG.getBitcast(VT, InputV);
  }

  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
                         -1};
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64-bits.
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(LoIdx, DL, MVT::i8)));

    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
        !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  }

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      PSHUFBMask[i] = DAG.getConstant(
          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}

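// For example, zero-extending the low four bytes of a v16i8 vector to v4i32
// (Scale == 4, Offset == 0, AnyExt == false) emits a single X86ISD::VZEXT
// (pmovzxbd) on SSE4.1; without SSE4.1 the do/while loop above produces two
// interleaves with a zero vector, punpcklbw followed by punpcklwd.
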
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it
  // if valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements need to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}

/// \brief Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}

/// \brief Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  V = peekThroughBitcasts(V);
  return ISD::isNON_EXTLoad(V.getNode());
}

/// \brief Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
      // Using zext to expand a narrow element won't work for non-zero
      // insertions.
      if (!IsV1Zeroable)
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::v4i32;
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply, and
    // the V1 elements can't be permuted in any way.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
    // This is essentially a special case blend operation, but if we have
    // general purpose blend operations, they are always faster. Bail and let
    // the rest of the lowering handle these as blends.
    if (Subtarget.hasSSE41())
      return SDValue();

    // Otherwise, use MOVSD or MOVSS.
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two types of floating point element types to handle!");
    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
                       ExtVT, V1, V2);
  }

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
                              DAG.getDataLayout(), VT)));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}

/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
                                                  SDValue V0, int BroadcastIdx,
                                                  const X86Subtarget &Subtarget,
                                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  EVT EltVT = VT.getVectorElementType();
  EVT V0VT = V0.getValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  EVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL,
                                         Scalar.getValueType()));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}

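// For example, broadcasting element 1 of a v4i32 shuffle whose input is a
// bitcast v2i64 build_vector has Scale == 2: the code reuses scalar operand
// 0, shifts it right by 32 bits, truncates to i32, and emits VBROADCAST
// (vpbroadcastd) of the result.
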
9317 /// \brief Try to lower broadcast of a single element.
9319 /// For convenience, this code also bundles all of the subtarget feature set
9320 /// filtering. While a little annoying to re-dispatch on type here, there isn't
9321 /// a convenient way to factor it out.
9322 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
9323 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
9324 SDValue V1, SDValue V2,
9326 const X86Subtarget &Subtarget,
9327 SelectionDAG &DAG) {
9328 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
9329 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
9330 (Subtarget.hasAVX2() && VT.isInteger())))
9333 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
9334 // we can only broadcast from a register with AVX2.
9335 unsigned NumElts = Mask.size();
9336 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
9337 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
9339 // Check that the mask is a broadcast.
9340 int BroadcastIdx = -1;
9341 for (int i = 0; i != (int)NumElts; ++i) {
9342 SmallVector<int, 8> BroadcastMask(NumElts, i);
9343 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
9349 if (BroadcastIdx < 0)
9351 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
9352 "a sorted mask where the broadcast "
9355 // Go up the chain of (vector) values to find a scalar load that we can
9356 // combine with the broadcast.
9359 switch (V.getOpcode()) {
9360 case ISD::BITCAST: {
9361 SDValue VSrc = V.getOperand(0);
9362 MVT SrcVT = VSrc.getSimpleValueType();
9363 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
9368 case ISD::CONCAT_VECTORS: {
9369 int OperandSize = Mask.size() / V.getNumOperands();
9370 V = V.getOperand(BroadcastIdx / OperandSize);
9371 BroadcastIdx %= OperandSize;
9374 case ISD::INSERT_SUBVECTOR: {
9375 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
9376 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
9380 int BeginIdx = (int)ConstantIdx->getZExtValue();
9382 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
9383 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
9384 BroadcastIdx -= BeginIdx;
9395 // Check if this is a broadcast of a scalar. We special case lowering
9396 // for scalars so that we can more effectively fold with loads.
9397 // First, look through bitcast: if the original value has a larger element
9398 // type than the shuffle, the broadcast element is in essence truncated.
9399 // Make that explicit to ease folding.
9400 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
9401 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
9402 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
9403 return TruncBroadcast;
9405 MVT BroadcastVT = VT;
9407 // Peek through any bitcast (only useful for loads).
9408 SDValue BC = peekThroughBitcasts(V);
9410 // Also check the simpler case, where we can directly reuse the scalar.
9411 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9412 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
9413 V = V.getOperand(BroadcastIdx);
9415 // If we can't broadcast from a register, check that the input is a load.
9416 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
9417 return SDValue();
9418 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
9419 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9420 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
9421 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
9422 Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
9423 }
9425 // If we are broadcasting a load that is only used by the shuffle
9426 // then we can reduce the vector load to the broadcasted scalar load.
9427 LoadSDNode *Ld = cast<LoadSDNode>(BC);
9428 SDValue BaseAddr = Ld->getOperand(1);
9429 EVT SVT = BroadcastVT.getScalarType();
9430 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
9431 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
9432 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
9433 DAG.getMachineFunction().getMachineMemOperand(
9434 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
9436 // Make sure the newly-created LOAD is in the same position as Ld in
9437 // terms of dependency. We create a TokenFactor for Ld and V,
9438 // and update uses of Ld's output chain to use the TokenFactor.
9439 if (Ld->hasAnyUseOfValue(1)) {
9440 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
9441 SDValue(Ld, 1), SDValue(V.getNode(), 1));
9442 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
9443 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
9444 SDValue(V.getNode(), 1));
9445 }
9446 } else if (!BroadcastFromReg) {
9447 // We can't broadcast from a vector register.
9448 return SDValue();
9449 } else if (BroadcastIdx != 0) {
9450 // We can only broadcast from the zero-element of a vector register,
9451 // but it can be advantageous to broadcast from the zero-element of a
9452 // subvector.
9453 if (!VT.is256BitVector() && !VT.is512BitVector())
9454 return SDValue();
9456 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
9457 if (VT == MVT::v4f64 || VT == MVT::v4i64)
9458 return SDValue();
9460 // Only broadcast the zero-element of a 128-bit subvector.
9461 unsigned EltSize = VT.getScalarSizeInBits();
9462 if (((BroadcastIdx * EltSize) % 128) != 0)
9463 return SDValue();
9465 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
9466 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
9467 DAG.getIntPtrConstant(BroadcastIdx, DL));
9468 }
9470 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
9471 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
9472 DAG.getBitcast(MVT::f64, V));
9474 // Bitcast back to the same scalar type as BroadcastVT.
9475 MVT SrcVT = V.getSimpleValueType();
9476 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
9477 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
9478 "Unexpected vector element size");
9479 if (SrcVT.isVector()) {
9480 unsigned NumSrcElts = SrcVT.getVectorNumElements();
9481 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
9482 } else {
9483 SrcVT = BroadcastVT.getScalarType();
9484 }
9485 V = DAG.getBitcast(SrcVT, V);
9486 }
9488 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9489 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
9490 V = DAG.getBitcast(MVT::f64, V);
9491 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
9492 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
9493 }
9495 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
9496 }
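// Illustrative walk of the loop above (hypothetical values): for a v8i32
// broadcast of element 5 where V1 is concat_vectors(A, B) with v4i32 halves,
// the CONCAT_VECTORS case rewrites V = B and BroadcastIdx = 1, letting the
// scalar and load checks run on the narrower source.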
9498 // Check for whether we can use INSERTPS to perform the shuffle. We only use
9499 // INSERTPS when the V1 elements are already in the correct locations
9500 // because otherwise we can just always use two SHUFPS instructions which
9501 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
9502 // perform INSERTPS if a single V1 element is out of place and all V2
9503 // elements are zeroable.
9504 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
9505 unsigned &InsertPSMask,
9506 const SmallBitVector &Zeroable,
9507 ArrayRef<int> Mask,
9508 SelectionDAG &DAG) {
9509 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
9510 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
9511 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9513 // Attempt to match INSERTPS with one element from VA or VB being
9514 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
9515 // will be updated with the required insertion.
9516 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
9517 ArrayRef<int> CandidateMask) {
9518 unsigned ZMask = 0;
9519 int VADstIndex = -1;
9520 int VBDstIndex = -1;
9521 bool VAUsedInPlace = false;
9523 for (int i = 0; i < 4; ++i) {
9524 // Synthesize a zero mask from the zeroable elements (includes undefs).
9525 if (Zeroable[i]) {
9526 ZMask |= 1 << i;
9527 continue;
9528 }
9530 // Flag if we use any VA inputs in place.
9531 if (i == CandidateMask[i]) {
9532 VAUsedInPlace = true;
9533 continue;
9534 }
9536 // We can only insert a single non-zeroable element.
9537 if (VADstIndex >= 0 || VBDstIndex >= 0)
9538 return false;
9540 if (CandidateMask[i] < 4) {
9541 // VA input out of place for insertion.
9542 VADstIndex = i;
9543 } else {
9544 // VB input for insertion.
9545 VBDstIndex = i;
9546 }
9547 }
9549 // Don't bother if we have no (non-zeroable) element for insertion.
9550 if (VADstIndex < 0 && VBDstIndex < 0)
9551 return false;
9553 // Determine element insertion src/dst indices. The src index is from the
9554 // start of the inserted vector, not the start of the concatenated vector.
9555 unsigned VBSrcIndex = 0;
9556 if (VADstIndex >= 0) {
9557 // If we have a VA input out of place, we use VA as the V2 element
9558 // insertion and don't use the original V2 at all.
9559 VBSrcIndex = CandidateMask[VADstIndex];
9560 VBDstIndex = VADstIndex;
9561 VB = VA;
9562 } else {
9563 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
9564 }
9566 // If no V1 inputs are used in place, then the result is created only from
9567 // the zero mask and the V2 insertion - so remove V1 dependency.
9568 if (!VAUsedInPlace)
9569 VA = DAG.getUNDEF(MVT::v4f32);
9571 // Update V1, V2 and InsertPSMask accordingly.
9572 V1 = VA;
9573 V2 = VB;
9575 // Insert the V2 element into the desired position.
9576 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
9577 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9578 return true;
9579 };
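// Worked example (illustrative): inserting V2 element 1 into destination
// slot 2 with element 3 zeroed gives
//   InsertPSMask = (1 << 6) | (2 << 4) | 0b1000 = 0x68,
// matching the INSERTPS immediate layout: bits [7:6] select the source
// element, [5:4] the destination slot, and [3:0] form the zero mask.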
9581 if (matchAsInsertPS(V1, V2, Mask))
9582 return true;
9584 // Commute and try again.
9585 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
9586 ShuffleVectorSDNode::commuteMask(CommutedMask);
9587 if (matchAsInsertPS(V2, V1, CommutedMask))
9588 return true;
9590 return false;
9591 }
9593 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
9594 SDValue V2, ArrayRef<int> Mask,
9595 const SmallBitVector &Zeroable,
9596 SelectionDAG &DAG) {
9597 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9598 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9600 // Attempt to match the insertps pattern.
9601 unsigned InsertPSMask;
9602 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
9603 return SDValue();
9605 // Insert the V2 element into the desired position.
9606 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9607 DAG.getConstant(InsertPSMask, DL, MVT::i8));
9608 }
9610 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
9611 /// UNPCK instruction.
9613 /// This specifically targets cases where we end up alternating between
9614 /// the two inputs, and so can permute them into something that feeds a single
9615 /// UNPCK instruction. Note that this routine only targets integer vectors
9616 /// because for floating point vectors we have a generalized SHUFPS lowering
9617 /// strategy that handles everything that doesn't *exactly* match an unpack,
9618 /// making this clever lowering unnecessary.
9619 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
9620 SDValue V1, SDValue V2,
9621 ArrayRef<int> Mask,
9622 SelectionDAG &DAG) {
9623 assert(!VT.isFloatingPoint() &&
9624 "This routine only supports integer vectors.");
9625 assert(VT.is128BitVector() &&
9626 "This routine only works on 128-bit vectors.");
9627 assert(!V2.isUndef() &&
9628 "This routine should only be used when blending two inputs.");
9629 assert(Mask.size() >= 2 && "Single element masks are invalid.");
9631 int Size = Mask.size();
9633 int NumLoInputs =
9634 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
9635 int NumHiInputs =
9636 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
9638 bool UnpackLo = NumLoInputs >= NumHiInputs;
9640 auto TryUnpack = [&](int ScalarSize, int Scale) {
9641 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
9642 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
9644 for (int i = 0; i < Size; ++i) {
9645 if (Mask[i] < 0)
9646 continue;
9648 // Each element of the unpack contains Scale elements from this mask.
9649 int UnpackIdx = i / Scale;
9651 // We only handle the case where V1 feeds the first slots of the unpack.
9652 // We rely on canonicalization to ensure this is the case.
9653 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
9654 return SDValue();
9656 // Setup the mask for this input. The indexing is tricky as we have to
9657 // handle the unpack stride.
9658 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
9659 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
9660 Mask[i] % Size;
9661 }
9663 // If we will have to shuffle both inputs to use the unpack, check whether
9664 // we can just unpack first and shuffle the result. If so, skip this unpack.
9665 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
9666 !isNoopShuffleMask(V2Mask))
9667 return SDValue();
9669 // Shuffle the inputs into place.
9670 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9671 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9673 // Cast the inputs to the type we will use to unpack them.
9674 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
9675 V1 = DAG.getBitcast(UnpackVT, V1);
9676 V2 = DAG.getBitcast(UnpackVT, V2);
9678 // Unpack the inputs and cast the result back to the desired type.
9679 return DAG.getBitcast(
9680 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9681 UnpackVT, V1, V2));
9682 };
9684 // We try each unpack from the largest to the smallest to try and find one
9685 // that fits this mask.
9686 int OrigScalarSize = VT.getScalarSizeInBits();
9687 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
9688 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
9689 return Unpack;
9691 // If none of the unpack-rooted lowerings worked (or were profitable) try an
9692 // initial unpack.
9693 if (NumLoInputs == 0 || NumHiInputs == 0) {
9694 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
9695 "We have to have *some* inputs!");
9696 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
9698 // FIXME: We could consider the total complexity of the permute of each
9699 // possible unpacking. Or at the least we should consider how many
9700 // half-crossings are created.
9701 // FIXME: We could consider commuting the unpacks.
9703 SmallVector<int, 32> PermMask((unsigned)Size, -1);
9704 for (int i = 0; i < Size; ++i) {
9705 if (Mask[i] < 0)
9706 continue;
9708 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
9710 PermMask[i] =
9711 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
9712 }
9713 return DAG.getVectorShuffle(
9714 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
9715 DL, VT, V1, V2),
9716 DAG.getUNDEF(VT), PermMask);
9717 }
9719 return SDValue();
9720 }
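// Worked example (illustrative): the v4i32 mask <1, 4, 0, 5> alternates
// between the inputs, so TryUnpack pre-shuffles V1 with <1, 0, -1, -1>,
// leaves V2 in place, and emits a single UNPCKL of the two.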
9722 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
9724 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
9725 /// support for floating point shuffles but not integer shuffles. These
9726 /// instructions will incur a domain crossing penalty on some chips though so
9727 /// it is better to avoid lowering through this for integer vectors where
9728 /// possible.
9729 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9730 const SmallBitVector &Zeroable,
9731 SDValue V1, SDValue V2,
9732 const X86Subtarget &Subtarget,
9733 SelectionDAG &DAG) {
9734 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
9735 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
9736 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
9738 if (V2.isUndef()) {
9739 // Check for being able to broadcast a single element.
9740 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9741 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
9742 return Broadcast;
9744 // Straight shuffle of a single input vector. Simulate this by using the
9745 // single input as both of the "inputs" to this instruction.
9746 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
9748 if (Subtarget.hasAVX()) {
9749 // If we have AVX, we can use VPERMILPS which will allow folding a load
9750 // into the shuffle.
9751 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
9752 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
9753 }
9755 return DAG.getNode(
9756 X86ISD::SHUFP, DL, MVT::v2f64,
9757 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
9758 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
9759 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
9760 }
9761 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
9762 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
9764 // If we have a single input, insert that into V1 if we can do so cheaply.
9765 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
9766 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9767 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
9768 return Insertion;
9769 // Try inverting the insertion since for v2 masks it is easy to do and we
9770 // can't reliably sort the mask one way or the other.
9771 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
9772 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
9773 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9774 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
9775 return Insertion;
9776 }
9778 // Try to use one of the special instruction patterns to handle two common
9779 // blend patterns if a zero-blend above didn't work.
9780 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
9781 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
9782 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
9783 // We can either use a special instruction to load over the low double or
9784 // to move just the low double.
9785 return DAG.getNode(
9786 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
9787 DL, MVT::v2f64, V2,
9788 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
9790 if (Subtarget.hasSSE41())
9791 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
9792 Zeroable, Subtarget, DAG))
9793 return Blend;
9795 // Use dedicated unpack instructions for masks that match their pattern.
9796 if (SDValue V =
9797 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
9798 return V;
9800 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
9801 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
9802 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
9803 }
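// Worked example (illustrative): for Mask <1, 3> the immediate is
// SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1) = 0b11, i.e.
// take the high double of V1 and the high double of V2.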
9805 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
9807 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
9808 /// the integer unit to minimize domain crossing penalties. However, for blends
9809 /// it falls back to the floating point shuffle operation with appropriate bit
9810 /// shuffling.
9811 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9812 const SmallBitVector &Zeroable,
9813 SDValue V1, SDValue V2,
9814 const X86Subtarget &Subtarget,
9815 SelectionDAG &DAG) {
9816 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
9817 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
9818 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
9820 if (V2.isUndef()) {
9821 // Check for being able to broadcast a single element.
9822 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9823 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9824 return Broadcast;
9826 // Straight shuffle of a single input vector. For everything from SSE2
9827 // onward this has a single fast instruction with no scary immediates.
9828 // We have to map the mask as it is actually a v4i32 shuffle instruction.
9829 V1 = DAG.getBitcast(MVT::v4i32, V1);
9830 int WidenedMask[4] = {
9831 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
9832 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
9833 return DAG.getBitcast(
9834 MVT::v2i64,
9835 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9836 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
9837 }
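// Worked example (illustrative): the single-input v2i64 mask <1, 0> widens
// to the v4i32 mask <2, 3, 0, 1>, so the swap becomes one PSHUFD with
// immediate 0x4E.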
9838 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
9839 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
9840 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
9841 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
9843 // If we have a blend of two same-type PACKUS operations and the blend aligns
9844 // with the low and high halves, we can just merge the PACKUS operations.
9845 // This is particularly important as it lets us merge shuffles that this
9846 // routine itself creates.
9847 auto GetPackNode = [](SDValue V) {
9848 V = peekThroughBitcasts(V);
9849 return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
9850 };
9851 if (SDValue V1Pack = GetPackNode(V1))
9852 if (SDValue V2Pack = GetPackNode(V2)) {
9853 EVT PackVT = V1Pack.getValueType();
9854 if (PackVT == V2Pack.getValueType())
9855 return DAG.getBitcast(MVT::v2i64,
9856 DAG.getNode(X86ISD::PACKUS, DL, PackVT,
9857 Mask[0] == 0 ? V1Pack.getOperand(0)
9858 : V1Pack.getOperand(1),
9859 Mask[1] == 2 ? V2Pack.getOperand(0)
9860 : V2Pack.getOperand(1)));
9861 }
9863 // Try to use shift instructions.
9864 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
9865 Zeroable, Subtarget, DAG))
9866 return Shift;
9868 // When loading a scalar and then shuffling it into a vector we can often do
9869 // the insertion cheaply.
9870 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9871 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
9872 return Insertion;
9873 // Try inverting the insertion since for v2 masks it is easy to do and we
9874 // can't reliably sort the mask one way or the other.
9875 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
9876 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9877 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
9878 return Insertion;
9880 // We have different paths for blend lowering, but they all must use the
9881 // *exact* same predicate.
9882 bool IsBlendSupported = Subtarget.hasSSE41();
9883 if (IsBlendSupported)
9884 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
9885 Zeroable, Subtarget, DAG))
9886 return Blend;
9888 // Use dedicated unpack instructions for masks that match their pattern.
9889 if (SDValue V =
9890 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
9891 return V;
9893 // Try to use byte rotation instructions.
9894 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
9895 if (Subtarget.hasSSSE3())
9896 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9897 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9898 return Rotate;
9900 // If we have direct support for blends, we should lower by decomposing into
9901 // a permute. That will be faster than the domain cross.
9902 if (IsBlendSupported)
9903 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
9904 Mask, DAG);
9906 // We implement this with SHUFPD which is pretty lame because it will likely
9907 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
9908 // However, all the alternatives are still more cycles and newer chips don't
9909 // have this problem. It would be really nice if x86 had better shuffles here.
9910 V1 = DAG.getBitcast(MVT::v2f64, V1);
9911 V2 = DAG.getBitcast(MVT::v2f64, V2);
9912 return DAG.getBitcast(MVT::v2i64,
9913 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
9914 }
9916 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
9918 /// This is used to disable more specialized lowerings when the shufps lowering
9919 /// will happen to be efficient.
9920 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
9921 // This routine only handles 128-bit shufps.
9922 assert(Mask.size() == 4 && "Unsupported mask size!");
9923 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
9924 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
9925 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
9926 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
9928 // To lower with a single SHUFPS we need to have the low half and high half
9929 // each requiring a single input.
9930 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
9931 return false;
9932 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
9933 return false;
9935 return true;
9936 }
9938 /// \brief Lower a vector shuffle using the SHUFPS instruction.
9940 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
9941 /// It makes no assumptions about whether this is the *best* lowering, it simply
9942 /// uses it.
9943 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
9944 ArrayRef<int> Mask, SDValue V1,
9945 SDValue V2, SelectionDAG &DAG) {
9946 SDValue LowV = V1, HighV = V2;
9947 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
9949 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9951 if (NumV2Elements == 1) {
9952 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
9954 // Compute the index adjacent to V2Index and in the same half by toggling
9955 // the low bit.
9956 int V2AdjIndex = V2Index ^ 1;
9958 if (Mask[V2AdjIndex] < 0) {
9959 // Handles all the cases where we have a single V2 element and an undef.
9960 // This will only ever happen in the high lanes because we commute the
9961 // vector otherwise.
9962 if (V2Index < 2)
9963 std::swap(LowV, HighV);
9964 NewMask[V2Index] -= 4;
9965 } else {
9966 // Handle the case where the V2 element ends up adjacent to a V1 element.
9967 // To make this work, blend them together as the first step.
9968 int V1Index = V2AdjIndex;
9969 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
9970 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
9971 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
9973 // Now proceed to reconstruct the final blend as we have the necessary
9974 // high or low half formed.
9975 if (V2Index < 2) {
9976 LowV = V2;
9977 HighV = V1;
9978 } else {
9979 HighV = V2;
9980 }
9981 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
9982 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
9983 }
9984 } else if (NumV2Elements == 2) {
9985 if (Mask[0] < 4 && Mask[1] < 4) {
9986 // Handle the easy case where we have V1 in the low lanes and V2 in the
9987 // high lanes.
9988 NewMask[2] -= 4;
9989 NewMask[3] -= 4;
9990 } else if (Mask[2] < 4 && Mask[3] < 4) {
9991 // We also handle the reversed case because this utility may get called
9992 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
9993 // arrange things in the right direction.
9994 NewMask[0] -= 4;
9995 NewMask[1] -= 4;
9996 HighV = V1;
9997 LowV = V2;
9998 } else {
9999 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10000 // trying to place elements directly, just blend them and set up the final
10001 // shuffle to place them.
10003 // The first two blend mask elements are for V1, the second two are for
10004 // V2.
10005 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10006 Mask[2] < 4 ? Mask[2] : Mask[3],
10007 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10008 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10009 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10010 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10012 // Now we do a normal shuffle of V1 by giving V1 as both operands to
10013 // the shuffle.
10015 NewMask[0] = Mask[0] < 4 ? 0 : 2;
10016 NewMask[1] = Mask[0] < 4 ? 2 : 0;
10017 NewMask[2] = Mask[2] < 4 ? 1 : 3;
10018 NewMask[3] = Mask[2] < 4 ? 3 : 1;
10019 }
10021 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10022 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10023 }
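// Worked example (illustrative): Mask <0, 4, 1, 5> takes the final mixture
// path: the first SHUFPS with BlendMask <0, 1, 0, 1> forms
// t = [V1[0], V1[1], V2[0], V2[1]], and the second SHUFPS of t with itself
// uses NewMask <0, 2, 1, 3> to produce [V1[0], V2[0], V1[1], V2[1]].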
10025 /// \brief Lower 4-lane 32-bit floating point shuffles.
10027 /// Uses instructions exclusively from the floating point unit to minimize
10028 /// domain crossing penalties, as these are sufficient to implement all v4f32
10029 /// shuffles.
10030 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10031 const SmallBitVector &Zeroable,
10032 SDValue V1, SDValue V2,
10033 const X86Subtarget &Subtarget,
10034 SelectionDAG &DAG) {
10035 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10036 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10037 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10039 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10041 if (NumV2Elements == 0) {
10042 // Check for being able to broadcast a single element.
10043 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10044 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10045 return Broadcast;
10047 // Use even/odd duplicate instructions for masks that match their pattern.
10048 if (Subtarget.hasSSE3()) {
10049 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10050 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10051 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10052 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10053 }
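// For reference (illustrative): MOVSLDUP maps [a, b, c, d] to [a, a, c, c]
// and MOVSHDUP maps it to [b, b, d, d], which is exactly what the two
// equivalence checks above match.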
10055 if (Subtarget.hasAVX()) {
10056 // If we have AVX, we can use VPERMILPS which will allow folding a load
10057 // into the shuffle.
10058 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
10059 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10060 }
10062 // Otherwise, use a straight shuffle of a single input vector. We pass the
10063 // input vector to both operands to simulate this with a SHUFPS.
10064 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
10065 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10066 }
10068 // There are special ways we can lower some single-element blends. However, we
10069 // have custom ways we can lower more complex single-element blends below that
10070 // we defer to if both this and BLENDPS fail to match, so restrict this to
10071 // when the V2 input is targeting element 0 of the mask -- that is the fast
10072 // case here.
10073 if (NumV2Elements == 1 && Mask[0] >= 4)
10074 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10075 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10076 return V;
10078 if (Subtarget.hasSSE41()) {
10079 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
10080 Zeroable, Subtarget, DAG))
10081 return Blend;
10083 // Use INSERTPS if we can complete the shuffle efficiently.
10084 if (SDValue V =
10085 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
10086 return V;
10088 if (!isSingleSHUFPSMask(Mask))
10089 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10090 DL, MVT::v4f32, V1, V2, Mask, DAG))
10091 return BlendPerm;
10092 }
10094 // Use low/high mov instructions.
10095 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10096 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
10097 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10098 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
10100 // Use dedicated unpack instructions for masks that match their pattern.
10101 if (SDValue V =
10102 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
10103 return V;
10105 // Otherwise fall back to a SHUFPS lowering strategy.
10106 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
10107 }
10109 /// \brief Lower 4-lane i32 vector shuffles.
10111 /// We try to handle these with integer-domain shuffles where we can, but for
10112 /// blends we use the floating point domain blend instructions.
10113 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10114 const SmallBitVector &Zeroable,
10115 SDValue V1, SDValue V2,
10116 const X86Subtarget &Subtarget,
10117 SelectionDAG &DAG) {
10118 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10119 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10120 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10122 // Whenever we can lower this as a zext, that instruction is strictly faster
10123 // than any alternative. It also allows us to fold memory operands into the
10124 // shuffle in many cases.
10125 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10126 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10127 return ZExt;
10129 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10131 if (NumV2Elements == 0) {
10132 // Check for being able to broadcast a single element.
10133 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10134 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10135 return Broadcast;
10137 // Straight shuffle of a single input vector. For everything from SSE2
10138 // onward this has a single fast instruction with no scary immediates.
10139 // We coerce the shuffle pattern to be compatible with UNPCK instructions
10140 // but we aren't actually going to use the UNPCK instruction because doing
10141 // so prevents folding a load into this instruction or making a copy.
10142 const int UnpackLoMask[] = {0, 0, 1, 1};
10143 const int UnpackHiMask[] = {2, 2, 3, 3};
10144 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10145 Mask = UnpackLoMask;
10146 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10147 Mask = UnpackHiMask;
10149 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10150 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10151 }
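// Worked example (illustrative): the mask <0, 0, 1, 1> becomes the PSHUFD
// immediate 0b01010000 (0x50), two bits per destination lane, lowest lane
// in the least significant bits.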
10153 // Try to use shift instructions.
10154 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
10155 Zeroable, Subtarget, DAG))
10156 return Shift;
10158 // There are special ways we can lower some single-element blends.
10159 if (NumV2Elements == 1)
10160 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10161 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10162 return V;
10164 // We have different paths for blend lowering, but they all must use the
10165 // *exact* same predicate.
10166 bool IsBlendSupported = Subtarget.hasSSE41();
10167 if (IsBlendSupported)
10168 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
10169 Zeroable, Subtarget, DAG))
10170 return Blend;
10172 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
10173 Zeroable, DAG))
10174 return Masked;
10176 // Use dedicated unpack instructions for masks that match their pattern.
10177 if (SDValue V =
10178 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
10179 return V;
10181 // Try to use byte rotation instructions.
10182 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10183 if (Subtarget.hasSSSE3())
10184 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10185 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10186 return Rotate;
10188 // Assume that a single SHUFPS is faster than an alternative sequence of
10189 // multiple instructions (even if the CPU has a domain penalty).
10190 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10191 if (!isSingleSHUFPSMask(Mask)) {
10192 // If we have direct support for blends, we should lower by decomposing into
10193 // a permute. That will be faster than the domain cross.
10194 if (IsBlendSupported)
10195 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
10196 Mask, DAG);
10198 // Try to lower by permuting the inputs into an unpack instruction.
10199 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10200 DL, MVT::v4i32, V1, V2, Mask, DAG))
10201 return Unpack;
10202 }
10204 // We implement this with SHUFPS because it can blend from two vectors.
10205 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
10206 // up the inputs, bypassing domain shift penalties that we would incur if we
10207 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
10208 // relevant.
10209 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10210 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10211 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
10212 return DAG.getBitcast(MVT::v4i32, ShufPS);
10213 }
10215 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10216 /// shuffle lowering, and the most complex part.
10218 /// The lowering strategy is to try to form pairs of input lanes which are
10219 /// targeted at the same half of the final vector, and then use a dword shuffle
10220 /// to place them onto the right half, and finally unpack the paired lanes into
10221 /// their final position.
10223 /// The exact breakdown of how to form these dword pairs and align them on the
10224 /// correct sides is really tricky. See the comments within the function for
10225 /// more of the details.
10227 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10228 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10229 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10230 /// vector, form the analogous 128-bit 8-element Mask.
10231 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10232 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10233 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10234 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10235 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
10237 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
10238 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10239 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
10241 SmallVector<int, 4> LoInputs;
10242 std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
10243 [](int M) { return M >= 0; });
10244 std::sort(LoInputs.begin(), LoInputs.end());
10245 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10246 SmallVector<int, 4> HiInputs;
10247 std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
10248 [](int M) { return M >= 0; });
10249 std::sort(HiInputs.begin(), HiInputs.end());
10250 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
10251 int NumLToL =
10252 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
10253 int NumHToL = LoInputs.size() - NumLToL;
10254 int NumLToH =
10255 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
10256 int NumHToH = HiInputs.size() - NumLToH;
10257 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10258 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10259 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10260 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
10262 // If we are splatting two values from one half - one to each half, then
10263 // we can shuffle that half so each is splatted to a dword, then splat those
10264 // to their respective halves.
10265 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
10266 int DOffset) {
10267 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10268 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10269 V = DAG.getNode(ShufWOp, DL, VT, V,
10270 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10271 V = DAG.getBitcast(PSHUFDVT, V);
10272 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10273 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10274 return DAG.getBitcast(VT, V);
10275 };
10277 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10278 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10279 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10280 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
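// Worked example (illustrative): the single-input mask
// <2, 2, 2, 2, 3, 3, 3, 3> has one low-half input targeting each half, so
// SplatHalfs emits PSHUFLW <2, 2, 3, 3> to pack the two words into dwords
// and PSHUFD <0, 0, 1, 1> to splat them across the halves.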
10282 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10283 // such inputs we can swap two of the dwords across the half mark and end up
10284 // with <=2 inputs to each half in each half. Once there, we can fall through
10285 // to the generic code below. For example:
10287 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10288 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10290 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10291 // and an existing 2-into-2 on the other half. In this case we may have to
10292 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10293 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10294 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10295 // because any other situation (including a 3-into-1 or 1-into-3 in the other
10296 // half than the one we target for fixing) will be fixed when we re-enter this
10297 // path. We will also combine away any sequence of PSHUFD instructions that
10298 // result into a single instruction. Here is an example of the tricky case:
10300 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10301 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10303 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10305 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10306 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10308 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10309 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10311 // The result is fine to be handled by the generic logic.
10312 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10313 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10314 int AOffset, int BOffset) {
10315 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10316 "Must call this with A having 3 or 1 inputs from the A half.");
10317 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10318 "Must call this with B having 1 or 3 inputs from the B half.");
10319 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10320 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10322 bool ThreeAInputs = AToAInputs.size() == 3;
10324 // Compute the index of dword with only one word among the three inputs in
10325 // a half by taking the sum of the half with three inputs and subtracting
10326 // the sum of the actual three inputs. The difference is the remaining
10327 // slot.
10328 int ADWord, BDWord;
10329 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
10330 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
10331 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
10332 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
10333 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
10334 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
10335 int TripleNonInputIdx =
10336 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
10337 TripleDWord = TripleNonInputIdx / 2;
10339 // We use xor with one to compute the adjacent DWord to whichever one the
10340 // OneInput is in.
10341 OneInputDWord = (OneInput / 2) ^ 1;
10343 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
10344 // and BToA inputs. If there is also such a problem with the BToB and AToB
10345 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
10346 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
10347 // is essential that we don't *create* a 3<-1 as then we might oscillate.
10348 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
10349 // Compute how many inputs will be flipped by swapping these DWords. We
10350 // need to balance this to ensure we don't form a 3-1 shuffle in the other
10352 // half.
10353 int NumFlippedAToBInputs =
10354 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
10355 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
10356 int NumFlippedBToBInputs =
10357 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
10358 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
10359 if ((NumFlippedAToBInputs == 1 &&
10360 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
10361 (NumFlippedBToBInputs == 1 &&
10362 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
10363 // We choose whether to fix the A half or B half based on whether that
10364 // half has zero flipped inputs. At zero, we may not be able to fix it
10365 // with that half. We also bias towards fixing the B half because that
10366 // will more commonly be the high half, and we have to bias one way.
10367 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
10368 ArrayRef<int> Inputs) {
10369 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
10370 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
10371 // Determine whether the free index is in the flipped dword or the
10372 // unflipped dword based on where the pinned index is. We use this bit
10373 // in an xor to conditionally select the adjacent dword.
10374 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
10375 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10376 if (IsFixIdxInput == IsFixFreeIdxInput)
10377 FixFreeIdx += 1;
10378 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10379 assert(IsFixIdxInput != IsFixFreeIdxInput &&
10380 "We need to be changing the number of flipped inputs!");
10381 int PSHUFHalfMask[] = {0, 1, 2, 3};
10382 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
10383 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
10385 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10387 for (int &M : Mask)
10388 if (M >= 0 && M == FixIdx)
10389 M = FixFreeIdx;
10390 else if (M >= 0 && M == FixFreeIdx)
10391 M = FixIdx;
10392 };
10393 if (NumFlippedBToBInputs != 0) {
10394 int BPinnedIdx =
10395 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
10396 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
10397 } else {
10398 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
10399 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
10400 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
10401 }
10402 }
10403 }
10405 int PSHUFDMask[] = {0, 1, 2, 3};
10406 PSHUFDMask[ADWord] = BDWord;
10407 PSHUFDMask[BDWord] = ADWord;
10408 V = DAG.getBitcast(
10409 VT,
10410 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10411 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10413 // Adjust the mask to match the new locations of A and B.
10414 for (int &M : Mask)
10415 if (M >= 0 && M/2 == ADWord)
10416 M = 2 * BDWord + M % 2;
10417 else if (M >= 0 && M/2 == BDWord)
10418 M = 2 * ADWord + M % 2;
10420 // Recurse back into this routine to re-compute state now that this isn't
10421 // a 3 and 1 problem.
10422 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
10423 DAG);
10424 };
10425 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
10426 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
10427 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
10428 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
10430 // At this point there are at most two inputs to the low and high halves from
10431 // each half. That means the inputs can always be grouped into dwords and
10432 // those dwords can then be moved to the correct half with a dword shuffle.
10433 // We use at most one low and one high word shuffle to collect these paired
10434 // inputs into dwords, and finally a dword shuffle to place them.
10435 int PSHUFLMask[4] = {-1, -1, -1, -1};
10436 int PSHUFHMask[4] = {-1, -1, -1, -1};
10437 int PSHUFDMask[4] = {-1, -1, -1, -1};
10439 // First fix the masks for all the inputs that are staying in their
10440 // original halves. This will then dictate the targets of the cross-half
10441 // shuffles.
10442 auto fixInPlaceInputs =
10443 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
10444 MutableArrayRef<int> SourceHalfMask,
10445 MutableArrayRef<int> HalfMask, int HalfOffset) {
10446 if (InPlaceInputs.empty())
10447 return;
10448 if (InPlaceInputs.size() == 1) {
10449 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10450 InPlaceInputs[0] - HalfOffset;
10451 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
10452 return;
10453 }
10454 if (IncomingInputs.empty()) {
10455 // Just fix all of the in place inputs.
10456 for (int Input : InPlaceInputs) {
10457 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
10458 PSHUFDMask[Input / 2] = Input / 2;
10459 }
10460 return;
10461 }
10463 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
10464 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10465 InPlaceInputs[0] - HalfOffset;
10466 // Put the second input next to the first so that they are packed into
10467 // a dword. We find the adjacent index by toggling the low bit.
10468 int AdjIndex = InPlaceInputs[0] ^ 1;
10469 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
10470 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
10471 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
10472 };
10473 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
10474 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
10476 // Now gather the cross-half inputs and place them into a free dword of
10477 // their target half.
10478 // FIXME: This operation could almost certainly be simplified dramatically to
10479 // look more like the 3-1 fixing operation.
10480 auto moveInputsToRightHalf = [&PSHUFDMask](
10481 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
10482 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
10483 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
10484 int DestOffset) {
10485 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
10486 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
10487 };
10488 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
10489 int Word) {
10490 int LowWord = Word & ~1;
10491 int HighWord = Word | 1;
10492 return isWordClobbered(SourceHalfMask, LowWord) ||
10493 isWordClobbered(SourceHalfMask, HighWord);
10494 };
10496 if (IncomingInputs.empty())
10497 return;
10499 if (ExistingInputs.empty()) {
10500 // Map any dwords with inputs from them into the right half.
10501 for (int Input : IncomingInputs) {
10502 // If the source half mask maps over the inputs, turn those into
10503 // swaps and use the swapped lane.
10504 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
10505 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
10506 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
10507 Input - SourceOffset;
10508 // We have to swap the uses in our half mask in one sweep.
10509 for (int &M : HalfMask)
10510 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
10511 M = Input;
10512 else if (M == Input)
10513 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
10514 } else {
10515 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
10516 Input - SourceOffset &&
10517 "Previous placement doesn't match!");
10519 // Note that this correctly re-maps both when we do a swap and when
10520 // we observe the other side of the swap above. We rely on that to
10521 // avoid swapping the members of the input list directly.
10522 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
10523 }
10525 // Map the input's dword into the correct half.
10526 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
10527 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
10528 else
10529 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
10530 Input / 2 &&
10531 "Previous placement doesn't match!");
10532 }
10534 // And just directly shift any other-half mask elements to be same-half
10535 // as we will have mirrored the dword containing the element into the
10536 // same position within that half.
10537 for (int &M : HalfMask)
10538 if (M >= SourceOffset && M < SourceOffset + 4) {
10539 M = M - SourceOffset + DestOffset;
10540 assert(M >= 0 && "This should never wrap below zero!");
10541 }
10542 return;
10543 }
10545 // Ensure we have the input in a viable dword of its current half. This
10546 // is particularly tricky because the original position may be clobbered
10547 // by inputs being moved and *staying* in that half.
10548 if (IncomingInputs.size() == 1) {
10549 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
10550 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
10551 SourceOffset;
10552 SourceHalfMask[InputFixed - SourceOffset] =
10553 IncomingInputs[0] - SourceOffset;
10554 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
10555 InputFixed);
10556 IncomingInputs[0] = InputFixed;
10557 }
10558 } else if (IncomingInputs.size() == 2) {
10559 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
10560 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
10561 // We have two non-adjacent or clobbered inputs we need to extract from
10562 // the source half. To do this, we need to map them into some adjacent
10563 // dword slot in the source mask.
10564 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
10565 IncomingInputs[1] - SourceOffset};
10567 // If there is a free slot in the source half mask adjacent to one of
10568 // the inputs, place the other input in it. We use (Index XOR 1) to
10569 // compute an adjacent index.
10570 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
10571 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
10572 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
10573 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
10574 InputsFixed[1] = InputsFixed[0] ^ 1;
10575 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
10576 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
10577 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
10578 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
10579 InputsFixed[0] = InputsFixed[1] ^ 1;
10580 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
10581 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
10582 // The two inputs are in the same DWord but it is clobbered and the
10583 // adjacent DWord isn't used at all. Move both inputs to the free
10584 // slot.
10585 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
10586 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
10587 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
10588 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
10589 } else {
10590 // The only way we hit this point is if there is no clobbering
10591 // (because there are no off-half inputs to this half) and there is no
10592 // free slot adjacent to one of the inputs. In this case, we have to
10593 // swap an input with a non-input.
10594 for (int i = 0; i < 4; ++i)
10595 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
10596 "We can't handle any clobbers here!");
10597 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
10598 "Cannot have adjacent inputs here!");
10600 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
10601 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
10603 // We also have to update the final source mask in this case because
10604 // it may need to undo the above swap.
10605 for (int &M : FinalSourceHalfMask)
10606 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
10607 M = InputsFixed[1] + SourceOffset;
10608 else if (M == InputsFixed[1] + SourceOffset)
10609 M = (InputsFixed[0] ^ 1) + SourceOffset;
10611 InputsFixed[1] = InputsFixed[0] ^ 1;
10612 }
10614 // Point everything at the fixed inputs.
10615 for (int &M : HalfMask)
10616 if (M == IncomingInputs[0])
10617 M = InputsFixed[0] + SourceOffset;
10618 else if (M == IncomingInputs[1])
10619 M = InputsFixed[1] + SourceOffset;
10621 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
10622 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
10623 }
10624 } else {
10625 llvm_unreachable("Unhandled input size!");
10626 }
10628 // Now hoist the DWord down to the right half.
10629 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
10630 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
10631 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
10632 for (int &M : HalfMask)
10633 for (int Input : IncomingInputs)
10634 if (M == Input)
10635 M = FreeDWord * 2 + Input % 2;
10636 };
10637 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
10638 /*SourceOffset*/ 4, /*DestOffset*/ 0);
10639 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
10640 /*SourceOffset*/ 0, /*DestOffset*/ 4);
10642 // Now enact all the shuffles we've computed to move the inputs into their
10643 // target halves.
10644 if (!isNoopShuffleMask(PSHUFLMask))
10645 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
10646 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
10647 if (!isNoopShuffleMask(PSHUFHMask))
10648 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
10649 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
10650 if (!isNoopShuffleMask(PSHUFDMask))
10651 V = DAG.getBitcast(
10652 VT,
10653 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10654 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10656 // At this point, each half should contain all its inputs, and we can then
10657 // just shuffle them into their final position.
10658 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
10659 "Failed to lift all the high half inputs to the low mask!");
10660 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
10661 "Failed to lift all the low half inputs to the high mask!");
10663 // Do a half shuffle for the low mask.
10664 if (!isNoopShuffleMask(LoMask))
10665 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
10666 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
10668 // Do a half shuffle with the high mask after shifting its values down.
10669 for (int &M : HiMask)
10670 if (M >= 0)
10671 M -= 4;
10672 if (!isNoopShuffleMask(HiMask))
10673 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
10674 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
10676 return V;
10677 }
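// Worked example (illustrative): a post-fixup HiMask of <4, 5, 7, 6> is
// shifted down to <0, 1, 3, 2> above, giving the PSHUFHW immediate
// 0b10110100 (0xB4).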
10679 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
10680 /// blend if only one input is used.
10681 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
10682 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10683 const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
10684 bool &V2InUse) {
10685 SDValue V1Mask[16];
10686 SDValue V2Mask[16];
10687 V1InUse = false;
10688 V2InUse = false;
10690 int Size = Mask.size();
10691 int Scale = 16 / Size;
10692 for (int i = 0; i < 16; ++i) {
10693 if (Mask[i / Scale] < 0) {
10694 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
10695 } else {
10696 const int ZeroMask = 0x80;
10697 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
10698 : ZeroMask;
10699 int V2Idx = Mask[i / Scale] < Size
10700 ? ZeroMask
10701 : (Mask[i / Scale] - Size) * Scale + i % Scale;
10702 if (Zeroable[i / Scale])
10703 V1Idx = V2Idx = ZeroMask;
10704 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
10705 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
10706 V1InUse |= (ZeroMask != V1Idx);
10707 V2InUse |= (ZeroMask != V2Idx);
10708 }
10709 }
10711 if (V1InUse)
10712 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
10713 DAG.getBitcast(MVT::v16i8, V1),
10714 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
10715 if (V2InUse)
10716 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
10717 DAG.getBitcast(MVT::v16i8, V2),
10718 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
10720 // If we need shuffled inputs from both, blend the two.
10721 SDValue V;
10722 if (V1InUse && V2InUse)
10723 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
10724 else
10725 V = V1InUse ? V1 : V2;
10727 // Cast the result back to the correct type.
10728 return DAG.getBitcast(VT, V);
10729 }
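// Worked example (illustrative): for a v8i16 blend whose first two mask
// entries are <0, 8, ...>, Scale is 2, so V1's byte mask starts {0, 1} with
// V2's holding the 0x80 zero sentinel, and bytes 2-3 swap roles; OR-ing the
// two PSHUFB results realizes the blend.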
10731 /// \brief Generic lowering of 8-lane i16 shuffles.
10733 /// This handles both single-input shuffles and combined shuffle/blends with
10734 /// two inputs. The single input shuffles are immediately delegated to
10735 /// a dedicated lowering routine.
10737 /// The blends are lowered in one of three fundamental ways. If there are few
10738 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
10739 /// of the input is significantly cheaper when lowered as an interleaving of
10740 /// the two inputs, try to interleave them. Otherwise, blend the low and high
10741 /// halves of the inputs separately (making them have relatively few inputs)
10742 /// and then concatenate them.
10743 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10744 const SmallBitVector &Zeroable,
10745 SDValue V1, SDValue V2,
10746 const X86Subtarget &Subtarget,
10747 SelectionDAG &DAG) {
10748 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
10749 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
10750 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10752 // Whenever we can lower this as a zext, that instruction is strictly faster
10753 // than any alternative.
10754 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10755 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
10756 return ZExt;
10758 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
10760 if (NumV2Inputs == 0) {
10761 // Check for being able to broadcast a single element.
10762 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10763 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
10764 return Broadcast;
10766 // Try to use shift instructions.
10767 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
10768 Zeroable, Subtarget, DAG))
10769 return Shift;
10771 // Use dedicated unpack instructions for masks that match their pattern.
10772 if (SDValue V =
10773 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
10774 return V;
10776 // Try to use byte rotation instructions.
10777 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
10778 Mask, Subtarget, DAG))
10781 // Make a copy of the mask so it can be modified.
10782 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
10783 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
10784 MutableMask, Subtarget,
10788 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
10789 "All single-input shuffles should be canonicalized to be V1-input "
10792 // Try to use shift instructions.
10793 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
10794 Zeroable, Subtarget, DAG))
10797 // See if we can use SSE4A Extraction / Insertion.
10798 if (Subtarget.hasSSE4A())
10799 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
10803 // There are special ways we can lower some single-element blends.
10804 if (NumV2Inputs == 1)
10805 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10806 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
10809 // We have different paths for blend lowering, but they all must use the
10810 // *exact* same predicate.
10811 bool IsBlendSupported = Subtarget.hasSSE41();
10812 if (IsBlendSupported)
10813 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
10814 Zeroable, Subtarget, DAG))
10817 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
10821 // Use dedicated unpack instructions for masks that match their pattern.
10823 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
10826 // Try to use byte rotation instructions.
10827 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10828 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
10831 if (SDValue BitBlend =
10832 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
10835 // Try to lower by permuting the inputs into an unpack instruction.
10836 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
10840 // If we can't directly blend but can use PSHUFB, that will be better as it
10841 // can both shuffle and set up the inefficient blend.
10842 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
10843 bool V1InUse, V2InUse;
10844 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
10845 Zeroable, DAG, V1InUse, V2InUse);
10848 // We can always bit-blend if we have to so the fallback strategy is to
10849 // decompose into single-input permutes and blends.
10850 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
/// \brief Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
                                          bool IsSingleInput) {
  // The modulus for the shuffle vector entries is based on whether this is
  // a single input or not.
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
         "We should only be called with masks with a power-of-2 size!");

  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
  // and 2^3 simultaneously. This is because we may have ambiguity with
  // partially undef inputs.
  bool ViableForN[3] = {true, true, true};

  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
    // need.
    if (Mask[i] < 0)
      continue;

    bool IsAnyViable = false;
    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
      if (ViableForN[j]) {
        uint64_t N = j + 1;

        // The shuffle mask must be equal to (i * 2^N) % M.
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
          IsAnyViable = true;
        else
          ViableForN[j] = false;
      }
    // Early exit if we exhaust the possible powers of two.
    if (!IsAnyViable)
      break;
  }

  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
    if (ViableForN[j])
      return j + 1;

  // Return 0 as there is no viable power of two.
  return 0;
}
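// Worked example (illustrative): for a single-input v16i8 shuffle with
// Mask = <0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14>, the modulus
// M is 16 and every defined entry satisfies Mask[i] == (i << 1) & 15, so the
// routine returns N = 1: a single even-element drop (one pack step in the
// caller) realizes the shuffle.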
/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity-reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const SmallBitVector &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

  // For single-input shuffles, there are some nicer lowering tricks we can use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;
    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
          return false;

      return true;
    };
    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      SmallVector<int, 4> LoInputs;
      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
                   [](int M) { return M >= 0 && M < 8; });
      std::sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      SmallVector<int, 4> HiInputs;
      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
                   [](int M) { return M >= 8; });
      std::sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2;
        LaneMap[I] = I;
      }
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // we can map it.
          while (j < je && PreDupI16Shuffle[j] >= 0)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple i16
            // shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      V1 = DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, V1, V1);

      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }
  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2,
                                                   Mask, Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
    return V;

  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single input
  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
  // want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
  if (Subtarget.hasSSSE3()) {
    bool V1InUse = false;
    bool V2InUse = false;

    SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    // do so. This avoids using them to handle blends-with-zero which is
    // important as a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse) {
      if (Subtarget.hasSSE41())
        if (SDValue Blend = lowerVectorShuffleAsBlend(
                DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
          return Blend;

      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very minorly) more efficient, we
      // prefer this lowering because there are common cases where part of
      // the complexity of the shuffles goes away when we do the final blend
      // as an unpack.
      // FIXME: It might be worth trying to detect if the unpack-feeding
      // shuffles will both be pshufb, in which case we shouldn't bother with
      // this.
      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
              DL, MVT::v16i8, V1, V2, Mask, DAG))
        return Unpack;
    }

    return PSHUFB;
  }
  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return BitBlend;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some even N. See the helper function for
  // details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  bool IsSingleInput = V2.isUndef();
  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
    // NumEvenDrops is the power of two stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.
    //
    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    // We use the mask type to pick which bytes are preserved based on how many
    // elements are dropped.
    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
    SDValue ByteClearMask = DAG.getBitcast(
        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

    // Now pack things back together.
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }

    return Result;
  }
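  // Worked example for the compaction above (illustrative): with
  // NumEvenDrops == 1, each input is ANDed with 0xFF per v8i16 element
  // (i.e. <0x00FF x 8>), clearing the odd bytes. PACKUS then saturates each
  // 16-bit element to 8 bits; because the high byte of every element is
  // already zero, no saturation occurs and the result is exactly the even
  // bytes of V1 followed by the even bytes of V2.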
  // Handle multi-input cases by blending single-input shuffles.
  if (NumV2Elements > 0)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
                                                      Mask, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
  SDValue V = V1;

  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     HiBlendMask);

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
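// Worked example for the fallback above (illustrative, hypothetical mask):
// for a single-input shuffle with Mask = <6, 0, 2, 4, 8, 14, 10, 12, u, u,
// u, u, u, u, u, u>, only even bytes are referenced, so the AND-with-0x00FF
// branch is taken and LoBlendMask squashes to <3, 0, 1, 2, 4, 7, 5, 6>; the
// final PACKUS narrows the shuffled v8i16 halves back to bytes.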
/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const SmallBitVector &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}
/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          SelectionDAG &DAG) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

  // Rather than splitting build-vectors, just build two narrower build
  // vectors. This helps shuffling with splats and zeros.
  auto SplitVector = [&](SDValue V) {
    V = peekThroughBitcasts(V);

    MVT OrigVT = V.getSimpleValueType();
    int OrigNumElements = OrigVT.getVectorNumElements();
    int OrigSplitNumElements = OrigNumElements / 2;
    MVT OrigScalarVT = OrigVT.getVectorElementType();
    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

    SDValue LoV, HiV;

    auto *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV) {
      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(0, DL));
      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(OrigSplitNumElements, DL));
    } else {
      SmallVector<SDValue, 16> LoOps, HiOps;
      for (int i = 0; i < OrigSplitNumElements; ++i) {
        LoOps.push_back(BV->getOperand(i));
        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
      }
      LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
      HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
    }
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
        V1BlendMask[i] = M;
        BlendMask[i] = i;
      }
    }

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };
  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
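// Worked example (illustrative, hypothetical mask): for a v8i32 shuffle with
// Mask = <0, 1, 9, 8, 7, 6, 14, 15>, LoMask = <0, 1, 9, 8> uses only the low
// halves of V1 and V2, so HalfBlend emits a single half-width shuffle of
// LoV1 and LoV2 (mask <0, 1, 5, 4>); HiMask = <7, 6, 14, 15> likewise uses
// only the high halves. The two v4i32 results are glued back together with
// CONCAT_VECTORS.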
/// \brief Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
                                                SDValue V1, SDValue V2,
                                                ArrayRef<int> Mask,
                                                SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
         "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if (M - Size != V2BroadcastIdx)
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx)
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                      DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
  // that the decomposed single-input shuffles don't end up here.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}
/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross lane shuffle which
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
                                                       SDValue V1, SDValue V2,
                                                       ArrayRef<int> Mask,
                                                       SelectionDAG &DAG) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  int LaneSize = Size / 2;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  bool LaneCrossing[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
  if (!LaneCrossing[0] || !LaneCrossing[1])
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  SmallVector<int, 32> FlippedBlendMask(Size);
  for (int i = 0; i < Size; ++i)
    FlippedBlendMask[i] =
        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                                ? Mask[i]
                                : Mask[i] % LaneSize +
                                      (i / LaneSize) * LaneSize + Size);

  // Flip the vector, and blend the results which should now be in-lane. The
  // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
  // 5 for the high source. The value 3 selects the high half of source 2 and
  // the value 2 selects the low half of source 2. We only use source 2 to
  // allow folding it into a memory operand.
  unsigned PERMMask = 3 | 2 << 4;
  SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
                                V1, DAG.getConstant(PERMMask, DL, MVT::i8));
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
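// Worked example (illustrative, hypothetical mask): a single-input v4f64
// shuffle with Mask = <2, 3, 0, 1> crosses lanes in both directions. Flipped
// becomes <V1.hi, V1.lo> via VPERM2X128 with immediate 0x23, and
// FlippedBlendMask computes to <4, 5, 6, 7>, so the final blend simply
// selects the flipped vector, yielding the swapped lanes.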
/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const SmallBitVector &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.

  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());

  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
  if (!IsV1Zero && !IsV2Zero) {
    // Check for patterns which can be matched with a single insert of a
    // 128-bit subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
    if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
      // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
      if (Subtarget.hasAVX2() && V2.isUndef())
        return SDValue();

      MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
                                   VT.getVectorNumElements() / 2);
      SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                                DAG.getIntPtrConstant(0, DL));
      SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                OnlyUsesV1 ? V1 : V2,
                                DAG.getIntPtrConstant(0, DL));
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
    }
  }

  // Otherwise form a 128-bit permutation. After accounting for undefs,
  // convert the 64-bit shuffle mask selection values into 128-bit
  // selection bits by dividing the indexes by 2 and shifting into positions
  // defined by a vperm2*128 instruction's immediate control byte.
  //
  // The immediate permute control byte looks like this:
  //    [1:0] - select 128 bits from sources for low half of destination
  //    [2]   - ignore
  //    [3]   - zero low half of destination
  //    [5:4] - select 128 bits from sources for high half of destination
  //    [6]   - ignore
  //    [7]   - zero high half of destination

  int MaskLO = Mask[0];
  if (MaskLO == SM_SentinelUndef)
    MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];

  int MaskHI = Mask[2];
  if (MaskHI == SM_SentinelUndef)
    MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];

  unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;

  // If either input is a zero vector, replace it with an undef input.
  // Shuffle mask values <  4 are selecting elements of V1.
  // Shuffle mask values >= 4 are selecting elements of V2.
  // Adjust each half of the permute mask by clearing the half that was
  // selecting the zero vector and setting the zero mask bit.
  if (IsV1Zero) {
    V1 = DAG.getUNDEF(VT);
    if (MaskLO < 4)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI < 4)
      PermMask = (PermMask & 0x0f) | 0x80;
  }
  if (IsV2Zero) {
    V2 = DAG.getUNDEF(VT);
    if (MaskLO >= 4)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI >= 4)
      PermMask = (PermMask & 0x0f) | 0x80;
  }

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getConstant(PermMask, DL, MVT::i8));
}
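// Worked example (illustrative): for a v4f64 shuffle with Mask = <0, 1, 6, 7>
// (low half of V1, high half of V2), MaskLO = 0 and MaskHI = 6, giving
// PermMask = 0/2 | (6/2) << 4 = 0x30: bits [1:0] = 0 select V1's low lane
// and bits [5:4] = 3 select V2's high lane.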
/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
/// the lane crosses early and then use simpler shuffles within each lane.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
static SDValue lowerVectorShuffleByMerging128BitLanes(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int NumLanes = Size / LaneSize;
  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
  // check whether the in-128-bit lane shuffles share a repeating pattern.
  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    int j = i / LaneSize;

    if (Lanes[j] < 0) {
      // First entry we've seen for this lane.
      Lanes[j] = Mask[i] / LaneSize;
    } else if (Lanes[j] != Mask[i] / LaneSize) {
      // This doesn't match the lane selected previously!
      return SDValue();
    }

    // Check that within each lane we have a consistent shuffle mask.
    int k = i % LaneSize;
    if (InLaneMask[k] < 0) {
      InLaneMask[k] = Mask[i] % LaneSize;
    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
      // This doesn't fit a repeating in-lane mask.
      return SDValue();
    }
  }

  // First shuffle the lanes into place.
  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
                                VT.getSizeInBits() / 64);
  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
  for (int i = 0; i < NumLanes; ++i)
    if (Lanes[i] >= 0) {
      LaneMask[2 * i + 0] = 2 * Lanes[i] + 0;
      LaneMask[2 * i + 1] = 2 * Lanes[i] + 1;
    }

  V1 = DAG.getBitcast(LaneVT, V1);
  V2 = DAG.getBitcast(LaneVT, V2);
  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

  // Cast it back to the type we actually want.
  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

  // Now do a simple shuffle that isn't lane crossing.
  SmallVector<int, 8> NewMask((unsigned)Size, -1);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
         "Must not introduce lane crosses at this point!");

  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}
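// Worked example (illustrative, hypothetical mask): a v8i32 shuffle with
// Mask = <4, 7, 6, 5, 8, 11, 10, 9> pulls destination lane 0 from V1's high
// lane and destination lane 1 from V2's low lane, with the repeating in-lane
// pattern <0, 3, 2, 1>. The lane fix-up is a v4i64 shuffle with mask
// <2, 3, 4, 5>, followed by the non-crossing repeated shuffle
// <0, 3, 2, 1, 4, 7, 6, 5>.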
/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
                                               SDValue V1, SDValue V2,
                                               ArrayRef<int> Mask,
                                               const X86Subtarget &Subtarget,
                                               SelectionDAG &DAG) {
  assert(VT.is256BitVector() && "Expected 256-bit vector");

  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfNumElts = NumElts / 2;
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
  if (!UndefLower && !UndefUpper)
    return SDValue();

  // Upper half is undef and lower half is whole upper subvector.
  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  if (UndefUpper &&
      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(HalfNumElts, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Lower half is undef and upper half is whole lower subvector.
  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (UndefLower &&
      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(HalfNumElts, DL));
  }

  // If the shuffle only uses two of the four halves of the input operands,
  // then extract them and perform the 'half' shuffle at half width.
  // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
  int HalfIdx1 = -1, HalfIdx2 = -1;
  SmallVector<int, 8> HalfMask(HalfNumElts);
  unsigned Offset = UndefLower ? HalfNumElts : 0;
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + Offset];
    if (M < 0)
      continue;

    // Determine which of the 4 half vectors this element is from.
    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
    int HalfIdx = M / HalfNumElts;

    // Determine the element index into its half vector source.
    int HalfElt = M % HalfNumElts;

    // We can shuffle with up to 2 half vectors, set the new 'half'
    // shuffle mask accordingly.
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
      continue;
    }
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
      continue;
    }

    // Too many half vectors referenced.
    return SDValue();
  }
  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

  // Only shuffle the halves of the inputs when useful.
  int NumLowerHalves =
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  int NumUpperHalves =
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);

  // uuuuXXXX - don't extract uppers just to insert again.
  if (UndefLower && NumUpperHalves != 0)
    return SDValue();

  // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
  if (UndefUpper && NumUpperHalves == 2)
    return SDValue();

  // AVX2 - XXXXuuuu - always extract lowers.
  if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
    // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();
    // AVX2 supports variable 32-bit element cross-lane shuffles.
    if (VT == MVT::v8f32 || VT == MVT::v8i32) {
      // XXXXuuuu - don't extract lowers and uppers.
      if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
        return SDValue();
    }
  }

  auto GetHalfVector = [&](int HalfIdx) {
    if (HalfIdx < 0)
      return DAG.getUNDEF(HalfVT);
    SDValue V = (HalfIdx < 2 ? V1 : V2);
    HalfIdx = (HalfIdx % 2) * HalfNumElts;
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
                       DAG.getIntPtrConstant(HalfIdx, DL));
  };

  SDValue Half1 = GetHalfVector(HalfIdx1);
  SDValue Half2 = GetHalfVector(HalfIdx2);
  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
                     DAG.getIntPtrConstant(Offset, DL));
}
/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
/// slot required by the given mask and require no permutation.
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;

  return true;
}
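// Illustrative example: for a v4 mask <0, 1, 6, 7>, input 0 is in place
// (elements 0 and 1 sit in slots 0 and 1) and so is input 1 (elements 6 and
// 7, i.e. 6 % 4 == 2 and 7 % 4 == 3, sit in slots 2 and 3).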
/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;

  // On AVX2 we may be able to just shuffle the lowest elements and then
  // broadcast the result.
  if (Subtarget.hasAVX2()) {
    for (unsigned BroadcastSize : {16, 32, 64}) {
      if (BroadcastSize <= VT.getScalarSizeInBits())
        continue;
      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs but only referencing the lowest 128-bit
      // lane of the inputs.
      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
        for (int i = 0; i != NumElts; i += NumBroadcastElts)
          for (int j = 0; j != NumBroadcastElts; ++j) {
            int M = Mask[i + j];
            if (M < 0)
              continue;
            int &R = RepeatMask[j];
            if (0 != ((M % NumElts) / NumLaneElts))
              return false;
            if (0 <= R && R != M)
              return false;
            R = M;
          }
        return true;
      };

      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
      if (!FindRepeatingBroadcastMask(RepeatMask))
        continue;

      // Shuffle the (lowest) repeated elements in place for broadcast.
      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

      // Shuffle the actual broadcast.
      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
      for (int i = 0; i != NumElts; i += NumBroadcastElts)
        for (int j = 0; j != NumBroadcastElts; ++j)
          BroadcastMask[i + j] = j;
      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
                                  BroadcastMask);
    }
  }

  // Bail if the shuffle mask doesn't cross 128-bit lanes.
  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  // Bail if we already have a repeated lane shuffle mask.
  SmallVector<int, 8> RepeatedShuffleMask;
  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
    return SDValue();

  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
  // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
  int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
  int NumSubLanes = NumLanes * SubLaneScale;
  int NumSubLaneElts = NumLaneElts / SubLaneScale;

  // Check that all the sources are coming from the same lane and see if we can
  // form a repeating shuffle mask (local to each sub-lane). At the same time,
  // determine the source sub-lane for each destination sub-lane.
  int TopSrcSubLane = -1;
  SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
  SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
      SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
      SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};

  for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
    // Extract the sub-lane mask, check that it all comes from the same lane
    // and normalize the mask entries to come from the first lane.
    int SrcLane = -1;
    SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
    for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
      int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
      if (M < 0)
        continue;
      int Lane = (M % NumElts) / NumLaneElts;
      if ((0 <= SrcLane) && (SrcLane != Lane))
        return SDValue();
      SrcLane = Lane;
      int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
      SubLaneMask[Elt] = LocalM;
    }

    // Whole sub-lane is UNDEF.
    if (SrcLane < 0)
      continue;

    // Attempt to match against the candidate repeated sub-lane masks.
    for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
      auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
        for (int i = 0; i != NumSubLaneElts; ++i) {
          if (M1[i] < 0 || M2[i] < 0)
            continue;
          if (M1[i] != M2[i])
            return false;
        }
        return true;
      };

      auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
      if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
        continue;

      // Merge the sub-lane mask into the matching repeated sub-lane mask.
      for (int i = 0; i != NumSubLaneElts; ++i) {
        int M = SubLaneMask[i];
        if (M < 0)
          continue;
        assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
               "Unexpected mask element");
        RepeatedSubLaneMask[i] = M;
      }

      // Track the top most source sub-lane - by setting the remaining to UNDEF
      // we can greatly simplify shuffle matching.
      int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
      TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
      Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
      break;
    }

    // Bail if we failed to find a matching repeated sub-lane mask.
    if (Dst2SrcSubLanes[DstSubLane] < 0)
      return SDValue();
  }
  assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
         "Unexpected source lane");

  // Create a repeating shuffle mask for the entire vector.
  SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
  for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
    int Lane = SubLane / SubLaneScale;
    auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
    for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
      int M = RepeatedSubLaneMask[Elt];
      if (M < 0)
        continue;
      int Idx = (SubLane * NumSubLaneElts) + Elt;
      RepeatedMask[Idx] = M + (Lane * NumLaneElts);
    }
  }
  SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

  // Shuffle each source sub-lane to its destination.
  SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
  for (int i = 0; i != NumElts; i += NumSubLaneElts) {
    int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
    if (SrcSubLane < 0)
      continue;
    for (int j = 0; j != NumSubLaneElts; ++j)
      SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
  }

  return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
                              SubLaneMask);
}
static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
                                         unsigned &ShuffleImm,
                                         ArrayRef<int> Mask) {
  int NumElts = VT.getVectorNumElements();
  assert(VT.getScalarType() == MVT::f64 &&
         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
         "Unexpected data type for VSHUFPD");

  // Mask for V8F64: 0/1, 8/9,  2/3, 10/11, 4/5, ..
  // Mask for V4F64: 0/1, 4/5,  2/3, 6/7..
  ShuffleImm = 0;
  bool ShufpdMask = true;
  bool CommutableMask = true;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] < 0)
      return false;
    int Val = (i & 6) + NumElts * (i & 1);
    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
      CommutableMask = false;
    ShuffleImm |= (Mask[i] % 2) << i;
  }

  if (ShufpdMask)
    return true;
  if (CommutableMask) {
    std::swap(V1, V2);
    return true;
  }

  return false;
}
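// Worked example (illustrative): for v4f64 and Mask = <0, 5, 2, 7>, every
// entry lies within its pair range [Val, Val + 1] where
// Val = (i & 6) + 4 * (i & 1) gives <0, 4, 2, 6>, so ShufpdMask holds and
// ShuffleImm accumulates the low bit of each entry: 0b1010 (0xA).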
static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  unsigned Immediate = 0;
  if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
    return SDValue();

  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                     DAG.getConstant(Immediate, DL, MVT::i8));
}
static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
                                           ArrayRef<int> Mask, SDValue V1,
                                           SDValue V2, SelectionDAG &DAG) {
  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
  MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());

  SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
  if (V2.isUndef())
    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);

  return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
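// Illustrative note: a single-input shuffle becomes VPERMV (a one-table
// variable permute), while a two-input shuffle becomes VPERMV3, where the
// mask vector indexes into the 2*N-element concatenation of V1 and V2.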
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const SmallBitVector &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  SmallVector<int, 4> WidenedMask;
  if (canWidenShuffleElements(Mask, WidenedMask))
    if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
                                             Zeroable, Subtarget, DAG))
      return V;

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
    }

    // With AVX2 we have direct support for this permutation.
    if (Subtarget.hasAVX2())
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

    // Try to create an in-lane repeating shuffle mask and then shuffle
    // the results into the target lanes.
    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return V;

    // Otherwise, fall back.
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2,
                                                   Mask, DAG);
  }

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue Op =
          lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
    return Op;

  // Try to create an in-lane repeating shuffle mask and then shuffle
  // the results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place,
  // we will be able to shuffle the other input even across lanes in a single
  // instruction, so skip this pattern.
  if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
                                isShuffleMaskInputInPlace(1, Mask))))
    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // If we have AVX2 then we always want to lower with a blend because at v4
  // we can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
                                                      Mask, DAG);

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}
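// Worked example for the single-input VPERMILPI case above (illustrative,
// hypothetical mask): Mask = <1, 0, 2, 3> sets bits 0 and 3 of VPERMILPMask
// (Mask[0] == 1 and Mask[3] == 3), giving immediate 0x9, which swaps the two
// doubles of the low 128-bit lane and keeps the high lane as-is.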
/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const SmallBitVector &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");

  SmallVector<int, 4> WidenedMask;
  if (canWidenShuffleElements(Mask, WidenedMask))
    if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
                                             Zeroable, Subtarget, DAG))
      return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on both lanes.
    SmallVector<int, 2> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
      SmallVector<int, 4> PSHUFDMask;
      scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v4i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
                      DAG.getBitcast(MVT::v8i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    // AVX2 provides a direct instruction for permuting a single input across
    // lanes.
    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // If we have VLX support, we can use VALIGN.
  if (Subtarget.hasVLX())
    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Rotate;

  // Try to use PALIGNR.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
                                                      Mask, Subtarget, DAG))
    return Rotate;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
    return V;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place,
  // we will be able to shuffle the other input even across lanes in a single
  // instruction, so skip this pattern.
  if (!isShuffleMaskInputInPlace(0, Mask) &&
      !isShuffleMaskInputInPlace(1, Mask))
    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
                                                    Mask, DAG);
}
/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const SmallBitVector &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 &&
           "Repeated masks must be half the mask width!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
      return V;

    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
    // have already handled any direct blends.
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2,
                                        DAG);
  }

  // Try to create an in-lane repeating shuffle mask and then shuffle
  // the results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have a single input shuffle with different shuffle patterns in the
  // two 128-bit lanes use the variable mask to VPERMILPS.
  if (V2.isUndef()) {
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);

    if (Subtarget.hasAVX2())
      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);

    // Otherwise, fall back.
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2,
                                                   Mask, DAG);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // If we have AVX2 then we always want to lower with a blend because at v8
  // we can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
                                                      Mask, DAG);

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const SmallBitVector &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the two 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // If we have VLX support, we can use VALIGN.
  if (Subtarget.hasVLX())
    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Rotate;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If the shuffle patterns aren't repeated but it is a single input, directly
  // generate a cross-lane VPERMD instruction.
  if (V2.isUndef()) {
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
  }

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
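  // e.g. a mask repeating <1,0,3,2> in both lanes can typically be emitted as
  // one SHUFPS (immediate 0xB1) on the bitcast v8f32 operands instead of a
  // variable-mask VPERMD sequence.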
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
    SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
                                                  CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v8i32, ShufPS);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
                                                    Mask, DAG);
}
/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const SmallBitVector &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // There are no generalized cross-lane shuffle operations available on i16
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
                                                     Mask, DAG);

    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v16 case.
      return lowerV8I16GeneralSingleInputVectorShuffle(
          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
    }
  }

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  // AVX512BWVL can lower to VPERMW.
  if (Subtarget.hasBWI() && Subtarget.hasVLX())
    return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const SmallBitVector &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There are no generalized cross-lane shuffle operations available on i8
  // element types.
  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                   DAG);

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
}
/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const SmallBitVector &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = VT.getVectorNumElements();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;

  // Handle special cases where the lower or upper half is UNDEF.
  if (SDValue V =
          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
  // can check for those subtargets here and avoid much of the subtarget
  // querying in the per-vector-type lowering routines. With AVX1 we have
  // essentially *zero* ability to manipulate a 256-bit vector with integer
  // types. Since we'll use floating point types there eventually, just
  // immediately cast everything to a float and operate entirely in that domain.
  if (VT.isInteger() && !Subtarget.hasAVX2()) {
    int ElementBits = VT.getScalarSizeInBits();
    if (ElementBits < 32) {
      // No floating point type available, if we can't use the bit operations
      // for masking/blending then decompose into 128-bit vectors.
      if (SDValue V =
              lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
        return V;
      if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
        return V;
      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
    }

    MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
                                VT.getVectorNumElements());
    V1 = DAG.getBitcast(FpVT, V1);
    V2 = DAG.getBitcast(FpVT, V2);
    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
  }

  switch (VT.SimpleTy) {
  case MVT::v4f64:
    return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i64:
    return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8f32:
    return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i32:
    return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i16:
    return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i8:
    return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 256-bit x86 vector type!");
  }
}
/// \brief Try to lower a vector shuffle as 128-bit shuffles.
static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
                                        ArrayRef<int> Mask, SDValue V1,
                                        SDValue V2, SelectionDAG &DAG) {
  assert(VT.getScalarSizeInBits() == 64 &&
         "Unexpected element type size for 128bit shuffle.");

  // Handling a 256-bit vector would require VLX, and
  // lowerV2X128VectorShuffle() is most probably the better solution there.
  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
  // Ensure elements came from the same Op.
  int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
  for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
    if (WidenedMask[i] == SM_SentinelZero)
      return SDValue();
    if (WidenedMask[i] == SM_SentinelUndef)
      continue;

    SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
    unsigned OpIndex = (i < Size/2) ? 0 : 1;
    if (Ops[OpIndex].isUndef())
      Ops[OpIndex] = Op;
    else if (Ops[OpIndex] != Op)
      return SDValue();
  }

  // Form a 128-bit permutation.
  // Convert the 64-bit shuffle mask selection values into 128-bit selection
  // bits defined by a vshuf64x2 instruction's immediate control byte.
  unsigned PermMask = 0, Imm = 0;
  unsigned ControlBitsNum = WidenedMask.size() / 2;

  for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
    // Use first element in place of undef mask.
    Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
    PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
  }
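  // For example, the widened mask <0,1,6,7> (low two 128-bit quarters of V1,
  // high two quarters of V2) produces PermMask 0xE4 (0b11100100).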
  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
                     DAG.getConstant(PermMask, DL, MVT::i8));
}
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
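      // Bit i of the immediate is set when element i selects the odd member
      // of its 128-bit pair; e.g. swapping within every pair,
      // <1,0,3,2,5,4,7,6>, yields the immediate 0x55.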
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
    }

    SmallVector<int, 4> RepeatedMask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
  }

  if (SDValue Shuf128 =
          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Shuf128;

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Unpck;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue Op =
          lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Op;

  return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue Unpck =
            lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
      return Unpck;

    // Otherwise, fall back to a SHUFPS sequence.
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
  }

  return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const SmallBitVector &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Shuf128 =
          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
    return Shuf128;

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on all four
    // 128-bit lanes.
    SmallVector<int, 2> Repeated128Mask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
      SmallVector<int, 4> PSHUFDMask;
      scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
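      // e.g. the per-lane v8i64 mask <1,0> scales to the v16i32 PSHUFD mask
      // <2,3,0,1>, swapping the two halves of every 128-bit lane.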
      return DAG.getBitcast(
          MVT::v8i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
                      DAG.getBitcast(MVT::v16i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    SmallVector<int, 4> Repeated256Mask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
                                                  Mask, Subtarget, DAG))
    return Rotate;

  // Try to use PALIGNR.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
                                                      Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
    return Unpck;

  return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const SmallBitVector &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the four 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
                                                  Mask, Subtarget, DAG))
    return Rotate;

  // Try to use byte rotation instructions.
  if (Subtarget.hasBWI())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // Assume that a single SHUFPS is faster than using a permv shuffle.
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
    SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
                                                  CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v16i32, ShufPS);
  }

  return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const SmallBitVector &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (V2.isUndef()) {
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v32 case.
      return lowerV8I16GeneralSingleInputVectorShuffle(
          DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
    }
  }

  return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const SmallBitVector &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  // VBMI can use VPERMV/VPERMV3 byte shuffles.
  if (Subtarget.hasVBMI())
    return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);

  // FIXME: Implement direct support for this type!
  return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const SmallBitVector &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/ basic ISA!");

  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = Mask.size();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast =
          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Broadcast;

  // Dispatch to each element type for lowering. If we don't have support for
  // specific element type shuffles at 512 bits, immediately split them and
  // lower them. Each lowering routine of a given type is allowed to assume that
  // the requisite ISA extensions for that element type are available.
  switch (VT.SimpleTy) {
  case MVT::v8f64:
    return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
  case MVT::v16f32:
    return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
  case MVT::v8i64:
    return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i32:
    return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i16:
    return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v64i8:
    return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 512-bit x86 vector type!");
  }
}
// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
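// e.g. a v16i1 shuffle is carried out as a v16i32 shuffle of the
// sign-extended masks and converted back with CVT2MASK or TRUNCATE.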
static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                      MVT VT, SDValue V1, SDValue V2,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/o basic ISA!");
  MVT ExtVT;
  switch (VT.SimpleTy) {
  default:
    llvm_unreachable("Expected a vector of i1 elements");
  case MVT::v2i1:
    ExtVT = MVT::v2i64;
    break;
  case MVT::v4i1:
    ExtVT = MVT::v4i32;
    break;
  case MVT::v8i1:
    ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
    break;
  case MVT::v16i1:
    ExtVT = MVT::v16i32;
    break;
  case MVT::v32i1:
    ExtVT = MVT::v32i16;
    break;
  case MVT::v64i1:
    ExtVT = MVT::v64i8;
    break;
  }

  if (ISD::isBuildVectorAllZeros(V1.getNode()))
    V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
  else if (ISD::isBuildVectorAllOnes(V1.getNode()))
    V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
  else
    V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);

  if (V2.isUndef())
    V2 = DAG.getUNDEF(ExtVT);
  else if (ISD::isBuildVectorAllZeros(V2.getNode()))
    V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
  else if (ISD::isBuildVectorAllOnes(V2.getNode()))
    V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
  else
    V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
  // Since i1 was sign-extended we can use X86ISD::CVT2MASK.
  int NumElems = VT.getVectorNumElements();
  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
      (Subtarget.hasDQI() && (NumElems < 32)))
    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);

  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
/// Helper function that returns true if the shuffle mask should be
/// commuted to improve canonicalization.
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
  int NumElements = Mask.size();

  int NumV1Elements = 0, NumV2Elements = 0, NumSentinelElements = 0;
  for (int M : Mask)
    if (M < 0)
      ++NumSentinelElements;
    else if (M < NumElements)
      ++NumV1Elements;
    else
      ++NumV2Elements;

  // Commute the shuffle as needed such that more elements come from V1 than
  // V2. This allows us to match the shuffle pattern strictly on how many
  // elements come from V1 without handling the symmetric cases.
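  // e.g. the 4-element mask <4,5,6,1> takes three elements from V2, so it is
  // commuted to <0,1,2,5>, which takes three from V1 instead.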
  if (NumV2Elements > NumV1Elements)
    return true;

  assert(NumV1Elements > 0 && "No V1 indices");

  if (NumV2Elements == 0)
    return false;

  // When the number of V1 and V2 elements are the same, try to minimize the
  // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum of
  // indices for V2. When those are equal, try to ensure that the number of odd
  // indices for V1 is lower than the number of odd indices for V2.
  if (NumV1Elements == NumV2Elements) {
    int LowV1Elements = 0, LowV2Elements = 0;
    for (int M : Mask.slice(0, NumElements / 2))
      if (M >= NumElements)
        ++LowV2Elements;
      else if (M >= 0)
        ++LowV1Elements;
    if (LowV2Elements > LowV1Elements)
      return true;
    if (LowV2Elements == LowV1Elements) {
      int SumV1Indices = 0, SumV2Indices = 0;
      for (int i = 0, Size = Mask.size(); i < Size; ++i)
        if (Mask[i] >= NumElements)
          SumV2Indices += i;
        else if (Mask[i] >= 0)
          SumV1Indices += i;
      if (SumV2Indices < SumV1Indices)
        return true;
      if (SumV2Indices == SumV1Indices) {
        int NumV1OddIndices = 0, NumV2OddIndices = 0;
        for (int i = 0, Size = Mask.size(); i < Size; ++i)
          if (Mask[i] >= NumElements)
            NumV2OddIndices += i % 2;
          else if (Mask[i] >= 0)
            NumV1OddIndices += i % 2;
        if (NumV2OddIndices < NumV1OddIndices)
          return true;
      }
    }
  }

  return false;
}
/// \brief Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  int NumElements = VT.getVectorNumElements();
  SDLoc DL(Op);
  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
         "Can't lower MMX shuffles");

  bool V1IsUndef = V1.isUndef();
  bool V2IsUndef = V2.isUndef();
  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);

  // When we create a shuffle node we put the UNDEF node to second operand,
  // but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
  if (V1IsUndef)
    return DAG.getCommutedVectorShuffle(*SVOp);

  // Check for non-undef masks pointing at an undef vector and make the masks
  // undef as well. This makes it easier to match the shuffle based solely on
  // the mask.
  if (V2IsUndef)
    for (int M : Mask)
      if (M >= NumElements) {
        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
        for (int &M : NewMask)
          if (M >= NumElements)
            M = -1;
        return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
      }

  // Check for illegal shuffle mask element index values.
  int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
  assert(llvm::all_of(Mask,
                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
         "Out of bounds shuffle index");

  // We actually see shuffles that are entirely re-arrangements of a set of
  // zero inputs. This mostly happens while decomposing complex shuffles into
  // simple ones. Directly lower these as a buildvector of zeros.
  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
  if (Zeroable.all())
    return getZeroVector(VT, Subtarget, DAG, DL);

  // Try to collapse shuffles into using a vector type with fewer elements but
  // wider element types. We cap this to not form integers or floating point
  // elements wider than 64 bits, but it might be interesting to form i128
  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
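  // e.g. the v4i32 shuffle <0,1,6,7> widens to the v2i64 shuffle <0,3>.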
  SmallVector<int, 16> WidenedMask;
  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
      canWidenShuffleElements(Mask, WidenedMask)) {
    MVT NewEltVT = VT.isFloatingPoint()
                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
    // Make sure that the new vector type is legal. For example, v2f64 isn't
    // legal on SSE1.
    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
      V1 = DAG.getBitcast(NewVT, V1);
      V2 = DAG.getBitcast(NewVT, V2);
      return DAG.getBitcast(
          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
    }
  }

  // Commute the shuffle if it will improve canonicalization.
  if (canonicalizeShuffleMaskWithCommute(Mask))
    return DAG.getCommutedVectorShuffle(*SVOp);

  // For each vector width, delegate to a specialized lowering routine.
  if (VT.is128BitVector())
    return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);
  if (VT.is256BitVector())
    return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);
  if (VT.is512BitVector())
    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (Is1BitVector)
    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);

  llvm_unreachable("Unimplemented!");
}
/// \brief Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  SDValue Cond = Op.getOperand(0);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
    return SDValue();
  auto *CondBV = cast<BuildVectorSDNode>(Cond);

  // Only non-legal VSELECTs reach this lowering, convert those into generic
  // shuffles and re-use the shuffle lowering path for blends.
  SmallVector<int, 32> Mask;
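  // e.g. for v4i32, a constant condition <-1,0,-1,0> becomes the shuffle mask
  // <0,5,2,7>, taking elements alternately from LHS and RHS.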
  for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
    SDValue CondElt = CondBV->getOperand(i);
    Mask.push_back(
        isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
                                     : -1);
  }
  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  // A vselect where all conditions and data are constants can be optimized into
  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
    return SDValue();

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return a
  // vector shuffle node instead.
  switch (Op.getSimpleValueType().SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8:
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget.hasAVX2())
      return Op;

    return SDValue();

  case MVT::v8i16:
  case MVT::v16i16:
    // AVX-512 BWI and VLX features support VSELECT with i16 elements.
    if (Subtarget.hasBWI() && Subtarget.hasVLX())
      return Op;

    // FIXME: We should custom lower this by fixing the condition and using i8
    // blends.
    return SDValue();
  }
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
    return SDValue();

  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         isNullConstant(Op.getOperand(1))) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getBitcast(MVT::f32, Extract);
  }

  if (VT == MVT::i32 || VT == MVT::i64) {
    // ExtractPS/pextrq works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }
  return SDValue();
}
/// Extract one bit from a mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
SDValue
X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDLoc dl(Vec);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  MVT EltVT = Op.getSimpleValueType();

  assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
         "Unexpected vector type in ExtractBitFromMaskVector");

  // A variable index can't be handled in mask registers,
  // so extend the vector to VR512.
  if (!isa<ConstantSDNode>(Idx)) {
    MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                              ExtVT.getVectorElementType(), Ext, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
      (VecVT.getVectorNumElements() < 8)) {
    // Use kshiftlw/rw instruction.
    VecVT = MVT::v16i1;
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
                      DAG.getUNDEF(VecVT),
                      Vec,
                      DAG.getIntPtrConstant(0, dl));
  }
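  // e.g. extracting bit 3 of a v16i1 mask becomes kshiftlw $12 followed by
  // kshiftrw $15, leaving the requested bit in position 0.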
  unsigned MaxShift = VecVT.getVectorNumElements() - 1;
  if (MaxShift - IdxVal)
    Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
                      DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
  Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
                    DAG.getConstant(MaxShift, dl, MVT::i8));
  return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
                     DAG.getIntPtrConstant(0, dl));
}
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);

  if (Op.getSimpleValueType() == MVT::i1)
    return ExtractBitFromMaskVector(Op, DAG);

  if (!isa<ConstantSDNode>(Idx)) {
    if (VecVT.is512BitVector() ||
        (VecVT.is256BitVector() && Subtarget.hasInt256() &&
         VecVT.getScalarSizeInBits() == 32)) {

      MVT MaskEltVT =
          MVT::getIntegerVT(VecVT.getScalarSizeInBits());
      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
                                    MaskEltVT.getSizeInBits());

      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
      auto PtrVT = getPointerTy(DAG.getDataLayout());
      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
                                 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
                                 DAG.getConstant(0, dl, PtrVT));
      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
                         DAG.getConstant(0, dl, PtrVT));
    }
    return SDValue();
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

  // If this is a 256-bit vector result, first extract the 128-bit vector and
  // then extract the element from the 128-bit vector.
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
    // Get the 128-bit vector.
    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
    MVT EltVT = VecVT.getVectorElementType();

    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
    // this can be done with a mask.
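    // e.g. element 5 of a v8i32 lives in the upper 128-bit half as element
    // 5 & 3 == 1.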
    IdxVal &= ElemsPerChunk - 1;
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, dl, MVT::i32));
  }

  assert(VecVT.is128BitVector() && "Unexpected vector length");

  MVT VT = Op.getSimpleValueType();

  if (VT.getSizeInBits() == 16) {
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
    // we're going to zero extend the register or fold the store (SSE41 only).
    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

    // Transform it so it matches pextrw which produces a 32-bit result.
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
      return Res;

  // TODO: handle v16i8.

  if (VT.getSizeInBits() == 32) {
    if (IdxVal == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    // to match extract_elt for f64.
    if (IdxVal == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  return SDValue();
}
/// Insert one bit into a mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
SDValue
X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Elt = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  MVT VecVT = Vec.getSimpleValueType();

  if (!isa<ConstantSDNode>(Idx)) {
    // Non constant index. Extend source and destination,
    // insert element and then truncate the result.
    MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
    MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
  unsigned NumElems = VecVT.getVectorNumElements();

  if (Vec.isUndef()) {
    if (IdxVal)
      EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    return EltInVec;
  }

  // Insertion of one bit into first or last position
  // can be done with two SHIFTs + OR.
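  // e.g. for IdxVal == 0, (Vec >> 1) << 1 clears bit 0 of the mask, and an OR
  // with EltInVec then supplies the new bit.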
  if (IdxVal == 0) {
    // EltInVec already at correct index and other bits are 0.
    // Clean the first bit in source vector.
    Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }
  if (IdxVal == NumElems - 1) {
    // Move the bit to the last position inside the vector.
    EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
                           DAG.getConstant(IdxVal, dl, MVT::i8));
    // Clean the last bit in the source vector.
    Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }

  // Use a shuffle to insert the element.
  SmallVector<int, 64> MaskVec(NumElems);
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = (i == IdxVal) ? NumElems : i;

  return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
}
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();

  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG);

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);
  if (!isa<ConstantSDNode>(N2))
    return SDValue();
  auto *N2C = cast<ConstantSDNode>(N2);
  unsigned IdxVal = N2C->getZExtValue();

  // If we are clearing out an element, we do this more efficiently with a
  // blend shuffle than a costly integer insertion.
  // TODO: would other rematerializable values (e.g. allbits) benefit as well?
  // TODO: pre-SSE41 targets will tend to use bit masking - this could still
  // be beneficial if we are inserting several zeros and can combine the masks.
  if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
    SmallVector<int, 8> ClearMask;
    for (unsigned i = 0; i != NumElts; ++i)
      ClearMask.push_back(i == IdxVal ? i + NumElts : i);
    SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
    return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
  }

  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
  // into that, and then insert the subvector back into the result.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    // With a 256-bit vector, we can insert into the zero element efficiently
    // using a blend if we have AVX or AVX2 and the right data type.
    if (VT.is256BitVector() && IdxVal == 0) {
      // TODO: It is worthwhile to cast integer to floating point and back
      // and incur a domain crossing penalty if that's what we'll end up
      // doing anyway after extracting to a 128-bit vector.
      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
        N2 = DAG.getIntPtrConstant(1, dl);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
      }
    }

    // Get the desired 128-bit vector chunk.
    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

    // Insert the element into the desired chunk.
    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(NumEltsIn128));
    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                    DAG.getConstant(IdxIn128, dl, MVT::i32));

    // Insert the changed part back into the bigger vector
    return insert128BitVector(N0, V, IdxVal, DAG, dl);
  }
  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
  if (Subtarget.hasSSE41()) {
    if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
      unsigned Opc;
      if (VT == MVT::v8i16) {
        Opc = X86ISD::PINSRW;
      } else {
        assert(VT == MVT::v16i8);
        Opc = X86ISD::PINSRB;
      }

      // Transform it so it matches pinsr{b,w} which expects a GR32 as its
      // second argument.
      if (N1.getValueType() != MVT::i32)
        N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
      if (N2.getValueType() != MVT::i32)
        N2 = DAG.getIntPtrConstant(IdxVal, dl);
      return DAG.getNode(Opc, dl, VT, N0, N1, N2);
    }

    if (EltVT == MVT::f32) {
      // Bits [7:6] of the constant are the source select. This will always be
      // zero here. The DAG Combiner may combine an extract_elt index into
      // these bits. For example (insert (extract, 3), 2) could be matched by
      // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
      // Bits [5:4] of the constant are the destination select. This is the
      // value of the incoming immediate.
      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
      // combine either bitwise AND or insert of float 0.0 to set these bits.

      bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
        // If this is an insertion of 32-bits into the low 32-bits of
        // a vector, we prefer to generate a blend with immediate rather
        // than an insertps. Blends are simpler operations in hardware and so
        // will always have equal or better performance than insertps.
        // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand.
        N2 = DAG.getIntPtrConstant(1, dl);
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
      }
      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
      // Create this as a scalar to vector.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
    }

    if (EltVT == MVT::i32 || EltVT == MVT::i64) {
      // PINSR* works with constant index.
      return Op;
    }
  }

  if (EltVT == MVT::i8)
    return SDValue();

  if (EltVT.getSizeInBits() == 16) {
    // Transform it so it matches pinsrw which expects a 16-bit value in a GR32
    // as its second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
  }
  return SDValue();
}
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    unsigned SizeFactor = OpVT.getSizeInBits()/128;
    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / SizeFactor);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  assert(OpVT.is128BitVector() && "Expected an SSE type!");
  return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");

  SDLoc dl(Op);
  SDValue In = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  MVT ResVT = Op.getSimpleValueType();

  assert((In.getSimpleValueType().is256BitVector() ||
          In.getSimpleValueType().is512BitVector()) &&
         "Can only extract from 256-bit or 512-bit vectors");

  if (ResVT.is128BitVector())
    return extract128BitVector(In, IdxVal, DAG, dl);
  if (ResVT.is256BitVector())
    return extract256BitVector(In, IdxVal, DAG, dl);

  llvm_unreachable("Unimplemented!");
}
static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
    if (llvm::all_of(ValidUsers,
                     [&I](SDValue V) { return V.getNode() != *I; }))
      return false;

  return true;
}
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue SubVec = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  MVT OpVT = Op.getSimpleValueType();
  MVT SubVecVT = SubVec.getSimpleValueType();

  if (OpVT.getVectorElementType() == MVT::i1)
    return insert1BitVector(Op, DAG, Subtarget);

  assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
         "Can only insert into 256-bit or 512-bit vectors");

  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
  // load:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr + 16), Elts/2)
  // --> load32 addr
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr + 32), Elts/2)
  // --> load64 addr
  // or a 16-byte or 32-byte broadcast:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr), Elts/2)
  // --> X86SubVBroadcast(load16 addr)
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr), Elts/2)
  // --> X86SubVBroadcast(load32 addr)
13777 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
13778 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
13779 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
13780 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
13781 if (Idx2 && Idx2->getZExtValue() == 0) {
13782 SDValue SubVec2 = Vec.getOperand(1);
13783 // If needed, look through bitcasts to get to the load.
13784 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
13785 bool Fast;
13786 unsigned Alignment = FirstLd->getAlignment();
13787 unsigned AS = FirstLd->getAddressSpace();
13788 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
13789 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
13790 OpVT, AS, Alignment, &Fast) && Fast) {
13791 SDValue Ops[] = {SubVec2, SubVec};
13792 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
13793 return Ld;
13794 }
13795 }
13796 // If lower/upper loads are the same and the only users of the load, then
13797 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
13798 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
13799 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
13800 areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
13801 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
13802 }
13804 // If this is subv_broadcast insert into both halves, use a larger
13805 // subv_broadcast.
13806 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
13807 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
13808 SubVec.getOperand(0));
13809 }
13810 }
13811 }
13812 }
13813 if (SubVecVT.is128BitVector())
13814 return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13816 if (SubVecVT.is256BitVector())
13817 return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13819 llvm_unreachable("Unimplemented!");
13820 }
13822 // Returns the appropriate wrapper opcode for a global reference.
13823 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
13824 // References to absolute symbols are never PC-relative.
13825 if (GV && GV->isAbsoluteSymbolRef())
13826 return X86ISD::Wrapper;
13828 CodeModel::Model M = getTargetMachine().getCodeModel();
13829 if (Subtarget.isPICStyleRIPRel() &&
13830 (M == CodeModel::Small || M == CodeModel::Kernel))
13831 return X86ISD::WrapperRIP;
13833 return X86ISD::Wrapper;
13834 }
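// Illustrative example (not tied to one code path): compiling with -fPIC for
// x86-64 in the small code model, a global address lowers to
// X86ISD::WrapperRIP and is eventually selected as a RIP-relative LEA,
// roughly "leaq sym(%rip), %rax"; the plain X86ISD::Wrapper form instead
// yields an absolute immediate such as "movl $sym, %eax" on 32-bit targets.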
13836 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13837 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
13838 // one of the above mentioned nodes. It has to be wrapped because otherwise
13839 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13840 // be used to form addressing modes. These wrapped nodes will be selected
13841 // into MOV32ri.
13842 SDValue
13843 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13844 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13846 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13847 // global base reg.
13848 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
13850 auto PtrVT = getPointerTy(DAG.getDataLayout());
13851 SDValue Result = DAG.getTargetConstantPool(
13852 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
13853 SDLoc DL(CP);
13854 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
13855 // With PIC, the address is actually $g + Offset.
13856 if (OpFlag) {
13857 Result =
13858 DAG.getNode(ISD::ADD, DL, PtrVT,
13859 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
13860 }
13862 return Result;
13863 }
13865 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13866 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13868 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13869 // global base reg.
13870 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
13872 auto PtrVT = getPointerTy(DAG.getDataLayout());
13873 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
13874 SDLoc DL(JT);
13875 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
13877 // With PIC, the address is actually $g + Offset.
13878 if (OpFlag)
13879 Result =
13880 DAG.getNode(ISD::ADD, DL, PtrVT,
13881 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
13883 return Result;
13884 }
13886 SDValue
13887 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13888 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13890 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13891 // global base reg.
13892 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
13893 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
13895 auto PtrVT = getPointerTy(DAG.getDataLayout());
13896 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
13898 SDLoc DL(Op);
13899 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
13901 // With PIC, the address is actually $g + Offset.
13902 if (isPositionIndependent() && !Subtarget.is64Bit()) {
13903 Result =
13904 DAG.getNode(ISD::ADD, DL, PtrVT,
13905 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
13906 }
13908 // For symbols that require a load from a stub to get the address, emit the
13909 // load.
13910 if (isGlobalStubReference(OpFlag))
13911 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
13912 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
13914 return Result;
13915 }
13917 SDValue
13918 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13919 // Create the TargetBlockAddressAddress node.
13920 unsigned char OpFlags =
13921 Subtarget.classifyBlockAddressReference();
13922 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13923 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13924 SDLoc dl(Op);
13925 auto PtrVT = getPointerTy(DAG.getDataLayout());
13926 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
13927 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
13929 // With PIC, the address is actually $g + Offset.
13930 if (isGlobalRelativeToPICBase(OpFlags)) {
13931 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
13932 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
13933 }
13935 return Result;
13936 }
13938 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
13939 const SDLoc &dl, int64_t Offset,
13940 SelectionDAG &DAG) const {
13941 // Create the TargetGlobalAddress node, folding in the constant
13942 // offset if it is legal.
13943 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
13944 CodeModel::Model M = DAG.getTarget().getCodeModel();
13945 auto PtrVT = getPointerTy(DAG.getDataLayout());
13946 SDValue Result;
13947 if (OpFlags == X86II::MO_NO_FLAG &&
13948 X86::isOffsetSuitableForCodeModel(Offset, M)) {
13949 // A direct static reference to a global.
13950 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
13951 Offset = 0;
13952 } else {
13953 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
13954 }
13956 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
13958 // With PIC, the address is actually $g + Offset.
13959 if (isGlobalRelativeToPICBase(OpFlags)) {
13960 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
13961 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
13962 }
13964 // For globals that require a load from a stub to get the address, emit the
13965 // load.
13966 if (isGlobalStubReference(OpFlags))
13967 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
13968 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
13970 // If there was a non-zero offset that we didn't fold, create an explicit
13971 // addition for it.
13972 if (Offset != 0)
13973 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
13974 DAG.getConstant(Offset, dl, PtrVT));
13976 return Result;
13977 }
13980 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13981 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13982 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13983 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13984 }
13986 static SDValue
13987 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13988 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13989 unsigned char OperandFlags, bool LocalDynamic = false) {
13990 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
13991 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13992 SDLoc dl(GA);
13993 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13994 GA->getValueType(0),
13995 GA->getOffset(),
13996 OperandFlags);
13998 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13999 : X86ISD::TLSADDR;
14001 if (InFlag) {
14002 SDValue Ops[] = { Chain, TGA, *InFlag };
14003 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14004 } else {
14005 SDValue Ops[] = { Chain, TGA };
14006 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14007 }
14009 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
14010 MFI.setAdjustsStack(true);
14011 MFI.setHasCalls(true);
14013 SDValue Flag = Chain.getValue(1);
14014 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
14015 }
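// For reference, on ELF x86-64 the TLSADDR node built here is emitted as the
// canonical general-dynamic sequence, approximately:
//   .byte 0x66
//   leaq x@tlsgd(%rip), %rdi
//   .word 0x6666; rex64; call __tls_get_addr@PLT
// with the variable's address returned in %rax, which is what the
// CopyFromReg above reads. (Sketch of the usual psABI sequence; the exact
// padding prefixes are added at MC emission time.)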
14017 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
14018 static SDValue
14019 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14020 const EVT PtrVT) {
14021 SDValue InFlag;
14022 SDLoc dl(GA); // ? function entry point might be better
14023 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14024 DAG.getNode(X86ISD::GlobalBaseReg,
14025 SDLoc(), PtrVT), InFlag);
14026 InFlag = Chain.getValue(1);
14028 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
14029 }
14031 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
14032 static SDValue
14033 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14034 const EVT PtrVT) {
14035 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
14036 X86::RAX, X86II::MO_TLSGD);
14037 }
14039 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
14040 SelectionDAG &DAG,
14041 const EVT PtrVT,
14042 bool is64Bit) {
14043 SDLoc dl(GA);
14045 // Get the start address of the TLS block for this module.
14046 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14047 .getInfo<X86MachineFunctionInfo>();
14048 MFI->incNumLocalDynamicTLSAccesses();
14050 SDValue Base;
14051 if (is64Bit) {
14052 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
14053 X86II::MO_TLSLD, /*LocalDynamic=*/true);
14054 } else {
14055 SDValue InFlag;
14056 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14057 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
14058 InFlag = Chain.getValue(1);
14059 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
14060 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
14061 }
14063 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
14064 // of Base.
14066 // Build x@dtpoff.
14067 unsigned char OperandFlags = X86II::MO_DTPOFF;
14068 unsigned WrapperKind = X86ISD::Wrapper;
14069 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14070 GA->getValueType(0),
14071 GA->getOffset(), OperandFlags);
14072 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14074 // Add x@dtpoff with the base.
14075 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
14076 }
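// Example of why local-dynamic pays off: for two TLS variables x and y in the
// same module, the expensive __tls_get_addr call that materializes Base is
// needed only once; x and y are then formed as Base + x@dtpoff and
// Base + y@dtpoff, and CleanupLocalDynamicTLSPass removes any duplicated
// Base computations within the function.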
14078 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
14079 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14080 const EVT PtrVT, TLSModel::Model model,
14081 bool is64Bit, bool isPIC) {
14082 SDLoc dl(GA);
14084 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
14085 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14086 is64Bit ? 257 : 256));
14088 SDValue ThreadPointer =
14089 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14090 MachinePointerInfo(Ptr));
14092 unsigned char OperandFlags = 0;
14093 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
14094 // initial exec model.
14095 unsigned WrapperKind = X86ISD::Wrapper;
14096 if (model == TLSModel::LocalExec) {
14097 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
14098 } else if (model == TLSModel::InitialExec) {
14099 if (is64Bit) {
14100 OperandFlags = X86II::MO_GOTTPOFF;
14101 WrapperKind = X86ISD::WrapperRIP;
14102 } else {
14103 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
14104 }
14105 } else {
14106 llvm_unreachable("Unexpected model");
14107 }
14109 // emit "addl x@ntpoff,%eax" (local exec)
14110 // or "addl x@indntpoff,%eax" (initial exec)
14111 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
14113 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
14114 GA->getOffset(), OperandFlags);
14115 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14117 if (model == TLSModel::InitialExec) {
14118 if (isPIC && !is64Bit) {
14119 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
14120 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14121 Offset);
14122 }
14124 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
14125 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14126 }
14128 // The address of the thread local variable is the add of the thread
14129 // pointer with the offset of the variable.
14130 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
14131 }
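// Putting the pieces together, the generated code looks roughly like:
//   initial exec, x86-64:  movq x@gottpoff(%rip), %rax
//                          addq %fs:0, %rax
//   local exec, 32-bit:    movl %gs:0, %eax
//                          addl $x@ntpoff, %eax
// (Illustrative; register assignment and the exact addressing forms are
// chosen later during selection.)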
14133 SDValue
14134 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14136 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14138 if (DAG.getTarget().Options.EmulatedTLS)
14139 return LowerToTLSEmulatedModel(GA, DAG);
14141 const GlobalValue *GV = GA->getGlobal();
14142 auto PtrVT = getPointerTy(DAG.getDataLayout());
14143 bool PositionIndependent = isPositionIndependent();
14145 if (Subtarget.isTargetELF()) {
14146 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
14147 switch (model) {
14148 case TLSModel::GeneralDynamic:
14149 if (Subtarget.is64Bit())
14150 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14151 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14152 case TLSModel::LocalDynamic:
14153 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14154 Subtarget.is64Bit());
14155 case TLSModel::InitialExec:
14156 case TLSModel::LocalExec:
14157 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
14158 PositionIndependent);
14159 }
14160 llvm_unreachable("Unknown TLS model.");
14161 }
14163 if (Subtarget.isTargetDarwin()) {
14164 // Darwin only has one model of TLS. Lower to that.
14165 unsigned char OpFlag = 0;
14166 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14167 X86ISD::WrapperRIP : X86ISD::Wrapper;
14169 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14170 // global base reg.
14171 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
14172 if (PIC32)
14173 OpFlag = X86II::MO_TLVP_PIC_BASE;
14174 else
14175 OpFlag = X86II::MO_TLVP;
14176 SDLoc DL(Op);
14177 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
14178 GA->getValueType(0),
14179 GA->getOffset(), OpFlag);
14180 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
14182 // With PIC32, the address is actually $g + Offset.
14183 if (PIC32)
14184 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
14185 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14186 Offset);
14188 // Lowering the machine isd will make sure everything is in the right
14189 // location.
14190 SDValue Chain = DAG.getEntryNode();
14191 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14192 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
14193 SDValue Args[] = { Chain, Offset };
14194 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
14195 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
14196 DAG.getIntPtrConstant(0, DL, true),
14197 Chain.getValue(1), DL);
14199 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
14200 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14201 MFI.setAdjustsStack(true);
14203 // And our return value (tls address) is in the standard call return value
14204 // location.
14205 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
14206 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
14207 }
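// The Darwin TLV calling convention this models is, approximately:
//   movq _x@TLVP(%rip), %rdi
//   callq *(%rdi)
// i.e. load the variable's thread-local descriptor and call through its
// thunk, which returns the address in %rax (or %eax for 32-bit). This is an
// illustrative sketch; the TLSCALL node above is what actually carries that
// contract through the backend.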
14209 if (Subtarget.isTargetKnownWindowsMSVC() ||
14210 Subtarget.isTargetWindowsItanium() ||
14211 Subtarget.isTargetWindowsGNU()) {
14212 // Just use the implicit TLS architecture
14213 // Need to generate something similar to:
14214 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
14215 // ; from TEB.
14216 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
14217 // mov rcx, qword [rdx+rcx*8]
14218 // mov eax, .tls$:tlsvar
14219 // [rax+rcx] contains the address
14220 // Windows 64bit: gs:0x58
14221 // Windows 32bit: fs:__tls_array
14223 SDLoc dl(GA);
14224 SDValue Chain = DAG.getEntryNode();
14226 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
14227 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
14228 // use its literal value of 0x2C.
14229 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
14230 ? Type::getInt8PtrTy(*DAG.getContext(),
14231 256)
14232 : Type::getInt32PtrTy(*DAG.getContext(),
14233 257));
14235 SDValue TlsArray = Subtarget.is64Bit()
14236 ? DAG.getIntPtrConstant(0x58, dl)
14237 : (Subtarget.isTargetWindowsGNU()
14238 ? DAG.getIntPtrConstant(0x2C, dl)
14239 : DAG.getExternalSymbol("_tls_array", PtrVT));
14241 SDValue ThreadPointer =
14242 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
14244 SDValue res;
14245 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
14246 res = ThreadPointer;
14247 } else {
14248 // Load the _tls_index variable
14249 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
14250 if (Subtarget.is64Bit())
14251 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
14252 MachinePointerInfo(), MVT::i32);
14253 else
14254 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
14256 auto &DL = DAG.getDataLayout();
14257 SDValue Scale =
14258 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
14259 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
14261 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
14262 }
14264 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
14266 // Get the offset of the start of the .tls section
14267 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14268 GA->getValueType(0),
14269 GA->getOffset(), X86II::MO_SECREL);
14270 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
14272 // The address of the thread local variable is the add of the thread
14273 // pointer with the offset of the variable.
14274 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
14275 }
14277 llvm_unreachable("TLS not implemented for this target.");
14278 }
14280 /// Lower SRA_PARTS and friends, which return two i32 values
14281 /// and take a 2 x i32 value to shift plus a shift amount.
14282 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
14283 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
14284 MVT VT = Op.getSimpleValueType();
14285 unsigned VTBits = VT.getSizeInBits();
14286 SDLoc dl(Op);
14287 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
14288 SDValue ShOpLo = Op.getOperand(0);
14289 SDValue ShOpHi = Op.getOperand(1);
14290 SDValue ShAmt = Op.getOperand(2);
14291 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
14292 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
14293 // during isel.
14294 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14295 DAG.getConstant(VTBits - 1, dl, MVT::i8));
14296 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
14297 DAG.getConstant(VTBits - 1, dl, MVT::i8))
14298 : DAG.getConstant(0, dl, VT);
14300 SDValue Tmp2, Tmp3;
14301 if (Op.getOpcode() == ISD::SHL_PARTS) {
14302 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
14303 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
14304 } else {
14305 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
14306 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
14307 }
14309 // If the shift amount is larger than or equal to the width of a part, we can't
14310 // rely on the results of shld/shrd. Insert a test and select the appropriate
14311 // values for large shift amounts.
14312 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14313 DAG.getConstant(VTBits, dl, MVT::i8));
14314 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
14315 AndNode, DAG.getConstant(0, dl, MVT::i8));
14317 SDValue Hi, Lo;
14318 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
14319 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
14320 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
14322 if (Op.getOpcode() == ISD::SHL_PARTS) {
14323 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14324 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14325 } else {
14326 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14327 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14328 }
14330 SDValue Ops[2] = { Lo, Hi };
14331 return DAG.getMergeValues(Ops, dl);
14332 }
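// Worked example for i64 SHL_PARTS on a 32-bit target (parts are i32):
//   if (Amt & 32) == 0:  Hi = shld(Hi, Lo, Amt)      Lo = Lo << (Amt & 31)
//   else:                Hi = Lo << (Amt & 31)        Lo = 0
// e.g. Amt == 40 takes the second row, giving Hi = Lo << 8, Lo = 0. The two
// CMOVs above implement exactly this selection on bit 5 of the shift amount.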
14334 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
14335 SelectionDAG &DAG) const {
14336 SDValue Src = Op.getOperand(0);
14337 MVT SrcVT = Src.getSimpleValueType();
14338 MVT VT = Op.getSimpleValueType();
14339 SDLoc dl(Op);
14341 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14342 if (SrcVT.isVector()) {
14343 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
14344 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
14345 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
14346 DAG.getUNDEF(SrcVT)));
14347 }
14348 if (SrcVT.getVectorElementType() == MVT::i1) {
14349 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
14350 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14351 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
14352 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14353 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14354 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
14355 }
14356 return SDValue();
14357 }
14359 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
14360 "Unknown SINT_TO_FP to lower!");
14362 // These are really Legal; return the operand so the caller accepts it as
14363 // Legal.
14364 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
14365 return Op;
14366 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14367 Subtarget.is64Bit()) {
14368 return Op;
14369 }
14371 SDValue ValueToStore = Op.getOperand(0);
14372 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14373 !Subtarget.is64Bit())
14374 // Bitcasting to f64 here allows us to do a single 64-bit store from
14375 // an SSE register, avoiding the store forwarding penalty that would come
14376 // with two 32-bit stores.
14377 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
14379 unsigned Size = SrcVT.getSizeInBits()/8;
14380 MachineFunction &MF = DAG.getMachineFunction();
14381 auto PtrVT = getPointerTy(MF.getDataLayout());
14382 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
14383 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14384 SDValue Chain = DAG.getStore(
14385 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
14386 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
14387 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
14388 }
14390 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
14391 SDValue StackSlot,
14392 SelectionDAG &DAG) const {
14393 // Build the FILD
14394 SDLoc DL(Op);
14395 SDVTList Tys;
14396 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
14397 if (useSSE)
14398 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
14399 else
14400 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
14402 unsigned ByteSize = SrcVT.getSizeInBits()/8;
14404 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14405 MachineMemOperand *MMO;
14406 if (FI) {
14407 int SSFI = FI->getIndex();
14408 MMO = DAG.getMachineFunction().getMachineMemOperand(
14409 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14410 MachineMemOperand::MOLoad, ByteSize, ByteSize);
14411 } else {
14412 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14413 StackSlot = StackSlot.getOperand(1);
14414 }
14415 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14416 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14417 X86ISD::FILD, DL,
14418 Tys, Ops, SrcVT, MMO);
14420 if (useSSE) {
14421 Chain = Result.getValue(1);
14422 SDValue InFlag = Result.getValue(2);
14424 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14425 // shouldn't be necessary except that RFP cannot be live across
14426 // multiple blocks. When stackifier is fixed, they can be uncoupled.
14427 MachineFunction &MF = DAG.getMachineFunction();
14428 unsigned SSFISize = Op.getValueSizeInBits()/8;
14429 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
14430 auto PtrVT = getPointerTy(MF.getDataLayout());
14431 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14432 Tys = DAG.getVTList(MVT::Other);
14433 SDValue Ops[] = {
14434 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14435 };
14436 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
14437 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14438 MachineMemOperand::MOStore, SSFISize, SSFISize);
14440 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14441 Ops, Op.getValueType(), MMO);
14442 Result = DAG.getLoad(
14443 Op.getValueType(), DL, Chain, StackSlot,
14444 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
14445 }
14447 return Result;
14448 }
14450 /// 64-bit unsigned integer to double expansion.
14451 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14452 SelectionDAG &DAG) const {
14453 // This algorithm is not obvious. Here is what we're trying to output:
14454 /*
14455 movq %rax, %xmm0
14456 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
14457 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
14458 #ifdef __SSE3__
14459 haddpd %xmm0, %xmm0
14460 #else
14461 pshufd $0x4e, %xmm0, %xmm1
14462 addpd %xmm1, %xmm0
14463 #endif
14464 */
14466 SDLoc dl(Op);
14467 LLVMContext *Context = DAG.getContext();
14469 // Build some magic constants.
14470 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
14471 Constant *C0 = ConstantDataVector::get(*Context, CV0);
14472 auto PtrVT = getPointerTy(DAG.getDataLayout());
14473 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
14475 SmallVector<Constant*,2> CV1;
14476 CV1.push_back(
14477 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
14478 APInt(64, 0x4330000000000000ULL))));
14479 CV1.push_back(
14480 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
14481 APInt(64, 0x4530000000000000ULL))));
14482 Constant *C1 = ConstantVector::get(CV1);
14483 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
14485 // Load the 64-bit value into an XMM register.
14486 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
14487 Op.getOperand(0));
14488 SDValue CLod0 =
14489 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
14490 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14491 /* Alignment = */ 16);
14492 SDValue Unpck1 =
14493 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
14495 SDValue CLod1 =
14496 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
14497 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14498 /* Alignment = */ 16);
14499 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
14500 // TODO: Are there any fast-math-flags to propagate here?
14501 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
14503 SDValue Result;
14504 if (Subtarget.hasSSE3()) {
14505 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
14506 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
14507 } else {
14508 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
14509 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
14510 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
14511 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
14512 }
14514 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
14515 DAG.getIntPtrConstant(0, dl));
14516 }
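// Why the magic constants work: punpckldq pairs the low u32 of the input with
// 0x43300000 and the high u32 with 0x45300000, producing the doubles
// (0x1.0p52 + lo) and (0x1.0p84 + hi * 0x1.0p32) bit-for-bit. Subtracting
// c1 = { 0x1.0p52, 0x1.0p84 } leaves exactly lo and hi * 2^32, and their
// horizontal sum is the original unsigned 64-bit value rounded once to double.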
14518 /// 32-bit unsigned integer to float expansion.
14519 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
14520 SelectionDAG &DAG) const {
14521 SDLoc dl(Op);
14522 // FP constant to bias correct the final result.
14523 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
14524 MVT::f64);
14526 // Load the 32-bit value into an XMM register.
14527 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
14528 Op.getOperand(0));
14530 // Zero out the upper parts of the register.
14531 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
14533 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14534 DAG.getBitcast(MVT::v2f64, Load),
14535 DAG.getIntPtrConstant(0, dl));
14537 // Or the load with the bias.
14538 SDValue Or = DAG.getNode(
14539 ISD::OR, dl, MVT::v2i64,
14540 DAG.getBitcast(MVT::v2i64,
14541 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
14542 DAG.getBitcast(MVT::v2i64,
14543 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
14544 Or =
14545 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14546 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
14548 // Subtract the bias.
14549 // TODO: Are there any fast-math-flags to propagate here?
14550 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
14552 // Handle final rounding.
14553 MVT DestVT = Op.getSimpleValueType();
14555 if (DestVT.bitsLT(MVT::f64))
14556 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
14557 DAG.getIntPtrConstant(0, dl));
14558 if (DestVT.bitsGT(MVT::f64))
14559 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
14561 // Handle final rounding.
14562 return Sub;
14563 }
14565 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
14566 const X86Subtarget &Subtarget, SDLoc &DL) {
14567 if (Op.getSimpleValueType() != MVT::v2f64)
14568 return SDValue();
14570 SDValue N0 = Op.getOperand(0);
14571 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
14573 // Legalize to v4i32 type.
14574 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
14575 DAG.getUNDEF(MVT::v2i32));
14577 if (Subtarget.hasAVX512())
14578 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
14580 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
14581 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
14582 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
14583 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
14585 // Two to the power of half-word-size.
14586 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
14588 // Clear upper part of LO, lower HI.
14589 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
14590 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
14592 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
14593 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
14594 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
14596 // Add the two halves.
14597 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
14598 }
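// Worked example: v = 0x89ABCDEF gives hi = 0x89AB and lo = 0xCDEF. Both
// halves are below 2^16, so the *signed* CVTSI2P converts them exactly, and
// fhi * 65536.0 + flo == 2309737967.0 == (double)v with no rounding error,
// because a double's 53-bit mantissa can hold any 32-bit integer exactly.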
14600 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
14601 const X86Subtarget &Subtarget) {
14602 // The algorithm is the following:
14603 // #ifdef __SSE4_1__
14604 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14605 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14606 // (uint4) 0x53000000, 0xaa);
14608 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14609 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14611 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14612 // return (float4) lo + fhi;
14614 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
14615 // reassociate the two FADDs, and if we do that, the algorithm fails
14616 // spectacularly (PR24512).
14617 // FIXME: If we ever have some kind of Machine FMF, this should be marked
14618 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
14619 // there's also the MachineCombiner reassociations happening on Machine IR.
14620 if (DAG.getTarget().Options.UnsafeFPMath)
14621 return SDValue();
14623 SDLoc DL(Op);
14624 SDValue V = Op->getOperand(0);
14625 MVT VecIntVT = V.getSimpleValueType();
14626 bool Is128 = VecIntVT == MVT::v4i32;
14627 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
14628 // If we convert to something other than the supported type, e.g., to v4f64,
14629 // abort early.
14630 if (VecFloatVT != Op->getSimpleValueType(0))
14631 return SDValue();
14633 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
14634 "Unsupported custom type");
14636 // In the #ifdef/#else code, we have in common:
14637 // - The vector of constants:
14638 // -- 0x4b000000
14639 // -- 0x53000000
14640 // - A shift:
14641 // -- v >> 16
14643 // Create the splat vector for 0x4b000000.
14644 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
14645 // Create the splat vector for 0x53000000.
14646 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
14648 // Create the right shift.
14649 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
14650 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14652 SDValue Low, High;
14653 if (Subtarget.hasSSE41()) {
14654 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14655 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14656 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
14657 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
14658 // Low will be bitcasted right away, so do not bother bitcasting back to its
14659 // original type.
14660 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14661 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
14662 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14663 // (uint4) 0x53000000, 0xaa);
14664 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
14665 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
14666 // High will be bitcasted right away, so do not bother bitcasting back to
14667 // its original type.
14668 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14669 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
14670 } else {
14671 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
14672 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14673 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14674 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14676 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14677 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14678 }
14680 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14681 SDValue VecCstFAdd = DAG.getConstantFP(
14682 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
14684 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14685 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
14686 // TODO: Are there any fast-math-flags to propagate here?
14687 SDValue FHigh =
14688 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14689 // return (float4) lo + fhi;
14690 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
14691 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14692 }
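// The constants decode as follows: as floats, 0x4b000000 is 0x1.0p23 and
// 0x53000000 is 0x1.0p39, so merging the 16-bit halves into their mantissas
// gives exactly lo_f = 0x1.0p23 + (v & 0xffff) and
// hi_f = 0x1.0p39 + (v >> 16) * 0x1.0p16. VecCstFAdd (0xD3000080) is
// -(0x1.0p39 + 0x1.0p23), hence
//   (hi_f + VecCstFAdd) + lo_f == (v >> 16) * 65536 + (v & 0xffff) == v,
// with a single final rounding - which is exactly the property that breaks
// if the two FADDs are reassociated under unsafe-fp-math (PR24512).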
14694 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14695 SelectionDAG &DAG) const {
14696 SDValue N0 = Op.getOperand(0);
14697 MVT SrcVT = N0.getSimpleValueType();
14699 SDLoc dl(Op);
14700 if (SrcVT.getVectorElementType() == MVT::i1) {
14701 if (SrcVT == MVT::v2i1)
14702 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14703 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
14704 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14705 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14706 DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
14707 }
14709 switch (SrcVT.SimpleTy) {
14710 default:
14711 llvm_unreachable("Custom UINT_TO_FP is not supported!");
14712 case MVT::v4i8:
14713 case MVT::v4i16:
14714 case MVT::v8i8:
14715 case MVT::v8i16: {
14716 MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14717 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14718 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14719 }
14720 case MVT::v2i32:
14721 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
14722 case MVT::v4i32:
14723 case MVT::v8i32:
14724 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
14725 case MVT::v16i8:
14726 case MVT::v16i16:
14727 assert(Subtarget.hasAVX512());
14728 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14729 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
14730 }
14731 }
14733 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14734 SelectionDAG &DAG) const {
14735 SDValue N0 = Op.getOperand(0);
14736 SDLoc dl(Op);
14737 auto PtrVT = getPointerTy(DAG.getDataLayout());
14739 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14740 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14741 // the optimization here.
14742 if (DAG.SignBitIsZero(N0))
14743 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14745 if (Op.getSimpleValueType().isVector())
14746 return lowerUINT_TO_FP_vec(Op, DAG);
14748 MVT SrcVT = N0.getSimpleValueType();
14749 MVT DstVT = Op.getSimpleValueType();
14751 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
14752 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
14753 // Conversions from unsigned i32 to f32/f64 are legal,
14754 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
14755 return Op;
14756 }
14758 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14759 return LowerUINT_TO_FP_i64(Op, DAG);
14760 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14761 return LowerUINT_TO_FP_i32(Op, DAG);
14762 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14763 return SDValue();
14765 // Make a 64-bit buffer, and use it to build an FILD.
14766 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14767 if (SrcVT == MVT::i32) {
14768 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
14769 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14770 StackSlot, MachinePointerInfo());
14771 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
14772 OffsetSlot, MachinePointerInfo());
14773 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14774 return Fild;
14775 }
14777 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14778 SDValue ValueToStore = Op.getOperand(0);
14779 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
14780 // Bitcasting to f64 here allows us to do a single 64-bit store from
14781 // an SSE register, avoiding the store forwarding penalty that would come
14782 // with two 32-bit stores.
14783 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
14784 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
14785 MachinePointerInfo());
14786 // For i64 source, we need to add the appropriate power of 2 if the input
14787 // was negative. This is the same as the optimization in
14788 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
14789 // we must be careful to do the computation in x87 extended precision, not
14790 // in SSE. (The generic code can't know it's OK to do this, or how to.)
14791 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14792 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
14793 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14794 MachineMemOperand::MOLoad, 8, 8);
14796 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14797 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14798 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14799 MVT::i64, MMO);
14801 APInt FF(32, 0x5F800000ULL);
14803 // Check whether the sign bit is set.
14804 SDValue SignSet = DAG.getSetCC(
14805 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
14806 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
14808 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14809 SDValue FudgePtr = DAG.getConstantPool(
14810 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
14812 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14813 SDValue Zero = DAG.getIntPtrConstant(0, dl);
14814 SDValue Four = DAG.getIntPtrConstant(4, dl);
14815 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14816 Zero, Four);
14817 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
14819 // Load the value out, extending it from f32 to f80.
14820 // FIXME: Avoid the extend by constructing the right constant pool?
14821 SDValue Fudge = DAG.getExtLoad(
14822 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
14823 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
14824 /* Alignment = */ 4);
14825 // Extend everything to 80 bits to force it to be done on x87.
14826 // TODO: Are there any fast-math-flags to propagate here?
14827 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14828 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
14829 DAG.getIntPtrConstant(0, dl));
14830 }
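// 0x5F800000 is 2^64 as an IEEE-754 float. FILD interprets the stored 64-bit
// buffer as signed, so an input with the sign bit set comes back as x - 2^64;
// adding the selected fudge value (2^64 when SignSet, +0.0 otherwise) in
// 80-bit x87 precision restores the unsigned value before the final FP_ROUND.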
14832 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
14833 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
14834 // just return an <SDValue(), SDValue()> pair.
14835 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
14836 // to i16, i32 or i64, and we lower it to a legal sequence.
14837 // If lowered to the final integer result we return a <result, SDValue()> pair.
14838 // Otherwise we lower it to a sequence ending with a FIST, return a
14839 // <FIST, StackSlot> pair, and the caller is responsible for loading
14840 // the final integer result from StackSlot.
14841 std::pair<SDValue,SDValue>
14842 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14843 bool IsSigned, bool IsReplace) const {
14844 SDLoc DL(Op);
14846 EVT DstTy = Op.getValueType();
14847 EVT TheVT = Op.getOperand(0).getValueType();
14848 auto PtrVT = getPointerTy(DAG.getDataLayout());
14850 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
14851 // f16 must be promoted before using the lowering in this routine.
14852 // fp128 does not use this lowering.
14853 return std::make_pair(SDValue(), SDValue());
14854 }
14856 // If using FIST to compute an unsigned i64, we'll need some fixup
14857 // to handle values above the maximum signed i64. A FIST is always
14858 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
14859 bool UnsignedFixup = !IsSigned &&
14860 DstTy == MVT::i64 &&
14861 (!Subtarget.is64Bit() ||
14862 !isScalarFPTypeInSSEReg(TheVT));
14864 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
14865 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
14866 // The low 32 bits of the fist result will have the correct uint32 result.
14867 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14868 DstTy = MVT::i64;
14869 }
14871 assert(DstTy.getSimpleVT() <= MVT::i64 &&
14872 DstTy.getSimpleVT() >= MVT::i16 &&
14873 "Unknown FP_TO_INT to lower!");
14875 // These are really Legal.
14876 if (DstTy == MVT::i32 &&
14877 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14878 return std::make_pair(SDValue(), SDValue());
14879 if (Subtarget.is64Bit() &&
14880 DstTy == MVT::i64 &&
14881 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14882 return std::make_pair(SDValue(), SDValue());
14884 // We lower FP->int64 into FISTP64 followed by a load from a temporary
14885 // stack slot.
14886 MachineFunction &MF = DAG.getMachineFunction();
14887 unsigned MemSize = DstTy.getSizeInBits()/8;
14888 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
14889 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14891 unsigned Opc;
14892 switch (DstTy.getSimpleVT().SimpleTy) {
14893 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14894 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14895 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14896 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14897 }
14899 SDValue Chain = DAG.getEntryNode();
14900 SDValue Value = Op.getOperand(0);
14901 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
14903 if (UnsignedFixup) {
14905 // Conversion to unsigned i64 is implemented with a select,
14906 // depending on whether the source value fits in the range
14907 // of a signed i64. Let Thresh be the FP equivalent of
14908 // 0x8000000000000000ULL.
14910 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
14911 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
14912 // Fist-to-mem64 FistSrc
14913 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
14914 // to XOR'ing the high 32 bits with Adjust.
14916 // Being a power of 2, Thresh is exactly representable in all FP formats.
14917 // For X87 we'd like to use the smallest FP type for this constant, but
14918 // for DAG type consistency we have to match the FP operand type.
14920 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
14921 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
14922 bool LosesInfo = false;
14923 if (TheVT == MVT::f64)
14924 // The rounding mode is irrelevant as the conversion should be exact.
14925 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
14926 &LosesInfo);
14927 else if (TheVT == MVT::f80)
14928 Status = Thresh.convert(APFloat::x87DoubleExtended(),
14929 APFloat::rmNearestTiesToEven, &LosesInfo);
14931 assert(Status == APFloat::opOK && !LosesInfo &&
14932 "FP conversion should have been exact");
14934 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
14936 SDValue Cmp = DAG.getSetCC(DL,
14937 getSetCCResultType(DAG.getDataLayout(),
14938 *DAG.getContext(), TheVT),
14939 Value, ThreshVal, ISD::SETLT);
14940 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
14941 DAG.getConstant(0, DL, MVT::i32),
14942 DAG.getConstant(0x80000000, DL, MVT::i32));
14943 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
14944 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
14945 *DAG.getContext(), TheVT),
14946 Value, ThreshVal, ISD::SETLT);
14947 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
14948 }
14950 // FIXME This causes a redundant load/store if the SSE-class value is already
14951 // in memory, such as if it is on the callstack.
14952 if (isScalarFPTypeInSSEReg(TheVT)) {
14953 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14954 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14955 MachinePointerInfo::getFixedStack(MF, SSFI));
14956 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14957 SDValue Ops[] = {
14958 Chain, StackSlot, DAG.getValueType(TheVT)
14959 };
14961 MachineMemOperand *MMO =
14962 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
14963 MachineMemOperand::MOLoad, MemSize, MemSize);
14964 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14965 Chain = Value.getValue(1);
14966 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
14967 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14968 }
14970 MachineMemOperand *MMO =
14971 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
14972 MachineMemOperand::MOStore, MemSize, MemSize);
14974 if (UnsignedFixup) {
14976 // Insert the FIST, load its result as two i32's,
14977 // and XOR the high i32 with Adjust.
14979 SDValue FistOps[] = { Chain, Value, StackSlot };
14980 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14981 FistOps, DstTy, MMO);
14983 SDValue Low32 =
14984 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
14985 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
14987 SDValue High32 =
14988 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
14989 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
14991 if (Subtarget.is64Bit()) {
14992 // Join High32 and Low32 into a 64-bit result.
14993 // (High32 << 32) | Low32
14994 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
14995 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
14996 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
14997 DAG.getConstant(32, DL, MVT::i8));
14998 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
14999 return std::make_pair(Result, SDValue());
15000 }
15002 SDValue ResultOps[] = { Low32, High32 };
15004 SDValue pair = IsReplace
15005 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15006 : DAG.getMergeValues(ResultOps, DL);
15007 return std::make_pair(pair, SDValue());
15008 } else {
15009 // Build the FP_TO_INT*_IN_MEM
15010 SDValue Ops[] = { Chain, Value, StackSlot };
15011 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15012 Ops, DstTy, MMO);
15013 return std::make_pair(FIST, StackSlot);
15014 }
15015 }
15017 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15018 const X86Subtarget &Subtarget) {
15019 MVT VT = Op->getSimpleValueType(0);
15020 SDValue In = Op->getOperand(0);
15021 MVT InVT = In.getSimpleValueType();
15023 SDLoc dl(Op);
15024 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15025 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15027 // Optimize vectors in AVX mode:
15029 // v8i16 -> v8i32
15030 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
15031 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
15032 // Concat upper and lower parts.
15034 // v4i32 -> v4i64
15035 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
15036 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
15037 // Concat upper and lower parts.
15040 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15041 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15042 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15043 return SDValue();
15045 if (Subtarget.hasInt256())
15046 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15048 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15049 SDValue Undef = DAG.getUNDEF(InVT);
15050 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15051 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15052 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15054 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15055 VT.getVectorNumElements()/2);
15057 OpLo = DAG.getBitcast(HVT, OpLo);
15058 OpHi = DAG.getBitcast(HVT, OpHi);
15060 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15061 }
15063 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15064 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15065 MVT VT = Op->getSimpleValueType(0);
15066 SDValue In = Op->getOperand(0);
15067 MVT InVT = In.getSimpleValueType();
15068 SDLoc DL(Op);
15069 unsigned NumElts = VT.getVectorNumElements();
15070 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
15071 return SDValue();
15073 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
15074 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15076 assert(InVT.getVectorElementType() == MVT::i1);
15078 // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
15079 MVT ExtVT = VT;
15080 if (!VT.is512BitVector() && !Subtarget.hasVLX())
15081 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15083 SDValue One =
15084 DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15085 SDValue Zero =
15086 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15088 SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
15089 if (VT == ExtVT)
15090 return SelectedVal;
15091 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15092 }
15094 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15095 SelectionDAG &DAG) {
15096 if (Subtarget.hasFp256())
15097 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15098 return Res;
15100 return SDValue();
15101 }
15103 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15104 SelectionDAG &DAG) {
15105 SDLoc DL(Op);
15106 MVT VT = Op.getSimpleValueType();
15107 SDValue In = Op.getOperand(0);
15108 MVT SVT = In.getSimpleValueType();
15110 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15111 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15113 if (Subtarget.hasFp256())
15114 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15115 return Res;
15117 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15118 VT.getVectorNumElements() != SVT.getVectorNumElements());
15119 return SDValue();
15120 }
15122 /// Helper to recursively truncate vector elements in half with PACKSS.
15123 /// It makes use of the fact that vector comparison results will be all-zeros
15124 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15125 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15126 /// within each 128-bit lane.
15127 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15128 const SDLoc &DL,
15129 SelectionDAG &DAG,
15130 const X86Subtarget &Subtarget) {
15131 // Requires SSE2 but AVX512 has fast truncate.
15132 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15133 return SDValue();
15135 EVT SrcVT = In.getValueType();
15137 // No truncation required, we might get here due to recursive calls.
15138 if (SrcVT == DstVT)
15139 return In;
15141 // We only support vector truncation to 128bits or greater from a
15142 // 256bits or greater source.
15143 if ((DstVT.getSizeInBits() % 128) != 0)
15144 return SDValue();
15145 if ((SrcVT.getSizeInBits() % 256) != 0)
15146 return SDValue();
15148 unsigned NumElems = SrcVT.getVectorNumElements();
15149 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15150 assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15152 EVT PackedSVT =
15153 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15155 // Extract lower/upper subvectors.
15156 unsigned NumSubElts = NumElems / 2;
15157 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15158 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15159 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15161 // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15162 if (SrcVT.is256BitVector()) {
15163 Lo = DAG.getBitcast(MVT::v8i16, Lo);
15164 Hi = DAG.getBitcast(MVT::v8i16, Hi);
15165 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15166 return DAG.getBitcast(DstVT, Res);
15167 }
15169 // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15170 // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15171 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15172 Lo = DAG.getBitcast(MVT::v16i16, Lo);
15173 Hi = DAG.getBitcast(MVT::v16i16, Hi);
15174 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15176 // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15177 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15178 Res = DAG.getBitcast(MVT::v4i64, Res);
15179 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15181 if (DstVT.is256BitVector())
15182 return DAG.getBitcast(DstVT, Res);
15184 // If 512bit -> 128bit truncate another stage.
15185 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15186 Res = DAG.getBitcast(PackedVT, Res);
15187 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15188 }
15190 // Recursively pack lower/upper subvectors, concat result and pack again.
15191 assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15192 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15193 Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15194 Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15196 PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15197 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
15198 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15199 }
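// Example: a v16i16 comparison result (every lane 0x0000 or 0xFFFF) becomes
// v16i8 with one PACKSS of its two 128-bit halves, since signed saturation
// maps 0 -> 0 and -1 -> -1 and therefore preserves all-zeros/all-ones lanes.
// That invariant is also why this helper is only safe for comparison-like
// inputs rather than arbitrary vector truncation.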
15201 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
15202 const X86Subtarget &Subtarget) {
15204 SDLoc DL(Op);
15205 MVT VT = Op.getSimpleValueType();
15206 SDValue In = Op.getOperand(0);
15207 MVT InVT = In.getSimpleValueType();
15209 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15211 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
15212 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
15213 if (InVT.getScalarSizeInBits() <= 16) {
15214 if (Subtarget.hasBWI()) {
15215 // legal, will go to VPMOVB2M, VPMOVW2M
15216 // Shift packed bytes not supported natively, bitcast to word
15217 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
15218 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
15219 DAG.getBitcast(ExtVT, In),
15220 DAG.getConstant(ShiftInx, DL, ExtVT));
15221 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
15222 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
15223 }
15224 // Use TESTD/Q, extended vector to packed dword/qword.
15225 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15226 "Unexpected vector type.");
15227 unsigned NumElts = InVT.getVectorNumElements();
15228 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15229 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
15230 InVT = ExtVT;
15231 ShiftInx = InVT.getScalarSizeInBits() - 1;
15232 }
15234 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
15235 DAG.getConstant(ShiftInx, DL, InVT));
15236 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
15237 }
15239 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
15240 SDLoc DL(Op);
15241 MVT VT = Op.getSimpleValueType();
15242 SDValue In = Op.getOperand(0);
15243 MVT InVT = In.getSimpleValueType();
15245 if (VT == MVT::i1) {
15246 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15247 "Invalid scalar TRUNCATE operation");
15248 if (InVT.getSizeInBits() >= 32)
15249 return SDValue();
15250 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
15251 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
15252 }
15253 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15254 "Invalid TRUNCATE operation");
15256 if (VT.getVectorElementType() == MVT::i1)
15257 return LowerTruncateVecI1(Op, DAG, Subtarget);
15259 // vpmovqb/w/d, vpmovdb/w, vpmovwb
15260 if (Subtarget.hasAVX512()) {
15261 // word to byte only under BWI
15262 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
15263 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
15264 DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
15265 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
15266 }
15268 // Truncate with PACKSS if we are truncating a vector comparison result.
15269   // TODO: We should be able to support other operations as long as we
15270   // are saturating+packing zero/all bits only.
15271 auto IsPackableComparison = [](SDValue V) {
15272 unsigned Opcode = V.getOpcode();
15273 return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ ||
15274 Opcode == X86ISD::CMPP);
15277 if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS &&
15278 all_of(In->ops(), IsPackableComparison))) {
15279 if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
15283 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
15284 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
15285 if (Subtarget.hasInt256()) {
15286 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
15287 In = DAG.getBitcast(MVT::v8i32, In);
15288 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
15290 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
15291 DAG.getIntPtrConstant(0, DL));
15294 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15295 DAG.getIntPtrConstant(0, DL));
15296 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15297 DAG.getIntPtrConstant(2, DL));
15298 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15299 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15300 static const int ShufMask[] = {0, 2, 4, 6};
15301 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
15304 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
15305   // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
15306 if (Subtarget.hasInt256()) {
15307 In = DAG.getBitcast(MVT::v32i8, In);
15309 SmallVector<SDValue,32> pshufbMask;
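      // The mask built below is per 128-bit lane: bytes {0,1, 4,5, 8,9, 12,13}
      // select the low halves of the lane's four dwords, and the trailing 0x80
      // entries zero the upper eight bytes of each lane.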
15310 for (unsigned i = 0; i < 2; ++i) {
15311 pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
15312 pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
15313 pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
15314 pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
15315 pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
15316 pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
15317 pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
15318 pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
15319 for (unsigned j = 0; j < 8; ++j)
15320 pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
15322 SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
15323 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
15324 In = DAG.getBitcast(MVT::v4i64, In);
15326 static const int ShufMask[] = {0, 2, -1, -1};
15327 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
15329 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15330 DAG.getIntPtrConstant(0, DL));
15331 return DAG.getBitcast(VT, In);
15334 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15335 DAG.getIntPtrConstant(0, DL));
15337 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15338 DAG.getIntPtrConstant(4, DL));
15340 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
15341 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
15343 // The PSHUFB mask:
15344 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
15345 -1, -1, -1, -1, -1, -1, -1, -1};
15347 SDValue Undef = DAG.getUNDEF(MVT::v16i8);
15348 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
15349 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
15351 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15352 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15354 // The MOVLHPS Mask:
15355 static const int ShufMask2[] = {0, 1, 4, 5};
15356 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
15357 return DAG.getBitcast(MVT::v8i16, res);
15360 // Handle truncation of V256 to V128 using shuffles.
15361 if (!VT.is128BitVector() || !InVT.is256BitVector())
15364 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
15366 unsigned NumElems = VT.getVectorNumElements();
15367 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
15369 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
15370 // Prepare truncation shuffle mask
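  // e.g. v8i32 -> v8i16: NVT is v16i16 and the mask is
  // {0,2,4,6,8,10,12,14,-1,...}, keeping the low half of each wide element;
  // the low 128-bit subvector extracted below is the truncated result.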
15371 for (unsigned i = 0; i != NumElems; ++i)
15372 MaskVec[i] = i * 2;
15373 SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
15374 DAG.getUNDEF(NVT), MaskVec);
15375 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
15376 DAG.getIntPtrConstant(0, DL));
15379 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
15380 const X86Subtarget &Subtarget,
15381 SelectionDAG &DAG) const {
15382 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
15384 MVT VT = Op.getSimpleValueType();
15386 if (VT.isVector()) {
15387 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
15388 SDValue Src = Op.getOperand(0);
15390 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
15391 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
15393 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
15394 DAG.getUNDEF(MVT::v2f32)));
15400 assert(!VT.isVector());
15402 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
15403 IsSigned, /*IsReplace=*/ false);
15404 SDValue FIST = Vals.first, StackSlot = Vals.second;
15405 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
15406 if (!FIST.getNode())
15409 if (StackSlot.getNode())
15410 // Load the result.
15411 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
15413 // The node is the result.
15417 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
15419 MVT VT = Op.getSimpleValueType();
15420 SDValue In = Op.getOperand(0);
15421 MVT SVT = In.getSimpleValueType();
15423 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
15425 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
15426 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
15427 In, DAG.getUNDEF(SVT)));
15430 /// The only differences between FABS and FNEG are the mask and the logic op.
15431 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
15432 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
15433 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
15434 "Wrong opcode for lowering FABS or FNEG.");
15436 bool IsFABS = (Op.getOpcode() == ISD::FABS);
15438 // If this is a FABS and it has an FNEG user, bail out to fold the combination
15439 // into an FNABS. We'll lower the FABS after that if it is still in use.
15441 for (SDNode *User : Op->uses())
15442 if (User->getOpcode() == ISD::FNEG)
15446 MVT VT = Op.getSimpleValueType();
15448 bool IsF128 = (VT == MVT::f128);
15450 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
15451 // decide if we should generate a 16-byte constant mask when we only need 4 or
15452 // 8 bytes for the scalar case.
15457 if (VT.isVector()) {
15459 EltVT = VT.getVectorElementType();
15460 } else if (IsF128) {
15461 // SSE instructions are used for optimized f128 logical operations.
15462 LogicVT = MVT::f128;
15465 // There are no scalar bitwise logical SSE/AVX instructions, so we
15466 // generate a 16-byte vector constant and logic op even for the scalar case.
15467 // Using a 16-byte mask allows folding the load of the mask with
15468   // the logic op, which can save ~4 bytes of code size.
15469 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
15473 unsigned EltBits = EltVT.getSizeInBits();
15474 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
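  // e.g. for f32: FABS is (FAND x, 0x7fffffff), FNEG is (FXOR x, 0x80000000),
  // and FNEG(FABS(x)) becomes (FOR x, 0x80000000), i.e. FNABS.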
15476 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
15477 const fltSemantics &Sem =
15478 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
15479 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
15480 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
15482 SDValue Op0 = Op.getOperand(0);
15483 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
15485 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
15486 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
15488 if (VT.isVector() || IsF128)
15489 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
15491 // For the scalar case extend to a 128-bit vector, perform the logic op,
15492 // and extract the scalar result back out.
15493 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
15494 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
15495 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
15496 DAG.getIntPtrConstant(0, dl));
15499 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
15500 SDValue Mag = Op.getOperand(0);
15501 SDValue Sign = Op.getOperand(1);
15504 // If the sign operand is smaller, extend it first.
15505 MVT VT = Op.getSimpleValueType();
15506 if (Sign.getSimpleValueType().bitsLT(VT))
15507 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
15509 // And if it is bigger, shrink it first.
15510 if (Sign.getSimpleValueType().bitsGT(VT))
15511 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
15513 // At this point the operands and the result should have the same
15514 // type, and that won't be f80 since that is not custom lowered.
15515 bool IsF128 = (VT == MVT::f128);
15516 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
15517 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
15518 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
15519 "Unexpected type in LowerFCOPYSIGN");
15521 MVT EltVT = VT.getScalarType();
15522 const fltSemantics &Sem =
15523 EltVT == MVT::f64 ? APFloat::IEEEdouble()
15524 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
15526 // Perform all scalar logic operations as 16-byte vectors because there are no
15527 // scalar FP logic instructions in SSE.
15528 // TODO: This isn't necessary. If we used scalar types, we might avoid some
15529 // unnecessary splats, but we might miss load folding opportunities. Should
15530 // this decision be based on OptimizeForSize?
15531 bool IsFakeVector = !VT.isVector() && !IsF128;
15534 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
15536 // The mask constants are automatically splatted for vector types.
15537 unsigned EltSizeInBits = VT.getScalarSizeInBits();
15538 SDValue SignMask = DAG.getConstantFP(
15539 APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
15540 SDValue MagMask = DAG.getConstantFP(
15541 APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
15543 // First, clear all bits but the sign bit from the second operand (sign).
15545 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
15546 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
15548 // Next, clear the sign bit from the first operand (magnitude).
15549 // TODO: If we had general constant folding for FP logic ops, this check
15550 // wouldn't be necessary.
15552 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
15553 APFloat APF = Op0CN->getValueAPF();
15555 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
15557 // If the magnitude operand wasn't a constant, we need to AND out the sign.
15559 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
15560 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
15563 // OR the magnitude value with the sign bit.
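  // Net effect: copysign(mag, sign) = (mag & ~SignMask) | (sign & SignMask),
  // with the mask constants splatted across all elements for vector types.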
15564 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
15565 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
15566 DAG.getIntPtrConstant(0, dl));
15569 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
15570 SDValue N0 = Op.getOperand(0);
15572 MVT VT = Op.getSimpleValueType();
15574 MVT OpVT = N0.getSimpleValueType();
15575 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
15576 "Unexpected type for FGETSIGN");
15578 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
15579 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
15580 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
15581 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
15582 Res = DAG.getZExtOrTrunc(Res, dl, VT);
15583 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
15587 // Check whether an OR'd tree is PTEST-able.
15588 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
15589 SelectionDAG &DAG) {
15590 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
15592 if (!Subtarget.hasSSE41())
15595 if (!Op->hasOneUse())
15598 SDNode *N = Op.getNode();
15601 SmallVector<SDValue, 8> Opnds;
15602 DenseMap<SDValue, unsigned> VecInMap;
15603 SmallVector<SDValue, 8> VecIns;
15604 EVT VT = MVT::Other;
15606   // Recognize a special case where a vector is cast into a wide integer to
15607   // test all 0s.
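  // e.g. for v4i32 V, the tree
  //   (or (or (extractelt V, 0), (extractelt V, 1)),
  //       (or (extractelt V, 2), (extractelt V, 3)))
  // compared against zero becomes a single (ptest V, V), setting ZF when V
  // is all zeros.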
15608 Opnds.push_back(N->getOperand(0));
15609 Opnds.push_back(N->getOperand(1));
15611 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
15612 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
15613 // BFS traverse all OR'd operands.
15614 if (I->getOpcode() == ISD::OR) {
15615 Opnds.push_back(I->getOperand(0));
15616 Opnds.push_back(I->getOperand(1));
15617 // Re-evaluate the number of nodes to be traversed.
15618 e += 2; // 2 more nodes (LHS and RHS) are pushed.
15622     // Quit if this is not an EXTRACT_VECTOR_ELT.
15623 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15626     // Quit if the index is not a constant.
15627 SDValue Idx = I->getOperand(1);
15628 if (!isa<ConstantSDNode>(Idx))
15631 SDValue ExtractedFromVec = I->getOperand(0);
15632 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
15633 if (M == VecInMap.end()) {
15634 VT = ExtractedFromVec.getValueType();
15635 // Quit if not 128/256-bit vector.
15636 if (!VT.is128BitVector() && !VT.is256BitVector())
15638 // Quit if not the same type.
15639 if (VecInMap.begin() != VecInMap.end() &&
15640 VT != VecInMap.begin()->first.getValueType())
15642 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
15643 VecIns.push_back(ExtractedFromVec);
15645 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
15648 assert((VT.is128BitVector() || VT.is256BitVector()) &&
15649 "Not extracted from 128-/256-bit vector.");
15651 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
15653 for (DenseMap<SDValue, unsigned>::const_iterator
15654 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
15655 // Quit if not all elements are used.
15656 if (I->second != FullMask)
15660 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
15662 // Cast all vectors into TestVT for PTEST.
15663 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
15664 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
15666   // If more than one full vector is evaluated, OR them together before PTEST.
15667 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
15668 // Each iteration will OR 2 nodes and append the result until there is only
15669 // 1 node left, i.e. the final OR'd value of all vectors.
15670 SDValue LHS = VecIns[Slot];
15671 SDValue RHS = VecIns[Slot + 1];
15672 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
15675 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
15676 VecIns.back(), VecIns.back());
15679 /// \brief Return true if \c Op has a use that doesn't just read flags.
15680 static bool hasNonFlagsUse(SDValue Op) {
15681 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15683 SDNode *User = *UI;
15684 unsigned UOpNo = UI.getOperandNo();
15685 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15686       // Look past the truncate.
15687 UOpNo = User->use_begin().getOperandNo();
15688 User = *User->use_begin();
15691 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15692 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15698 // Emit a KTEST instruction for bit vectors on AVX-512.
15699 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
15700 const X86Subtarget &Subtarget) {
15701 if (Op.getOpcode() == ISD::BITCAST) {
15702 auto hasKTEST = [&](MVT VT) {
15703 unsigned SizeInBits = VT.getSizeInBits();
15704 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
15705 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
15707 SDValue Op0 = Op.getOperand(0);
15708 MVT Op0VT = Op0.getValueType().getSimpleVT();
15709 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
15711 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
15716 /// Emit nodes that will be selected as "test Op0,Op0", or something
15717 /// equivalent.
15718 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
15719 SelectionDAG &DAG) const {
15720 if (Op.getValueType() == MVT::i1) {
15721 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
15722 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
15723 DAG.getConstant(0, dl, MVT::i8));
15725 // CF and OF aren't always set the way we want. Determine which
15726 // of these we need.
15727 bool NeedCF = false;
15728 bool NeedOF = false;
15731 case X86::COND_A: case X86::COND_AE:
15732 case X86::COND_B: case X86::COND_BE:
15735 case X86::COND_G: case X86::COND_GE:
15736 case X86::COND_L: case X86::COND_LE:
15737 case X86::COND_O: case X86::COND_NO: {
15738     // Check if we really need to set the
15739     // Overflow flag. If NoSignedWrap is present,
15740     // it is not actually needed.
15741 switch (Op->getOpcode()) {
15746 const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
15747 if (BinNode->Flags.hasNoSignedWrap())
15757 // See if we can use the EFLAGS value from the operand instead of
15758 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
15759 // we prove that the arithmetic won't overflow, we can't use OF or CF.
15760 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
15761 // Emit KTEST for bit vectors
15762 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
15764 // Emit a CMP with 0, which is the TEST pattern.
15765 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15766 DAG.getConstant(0, dl, Op.getValueType()));
15768 unsigned Opcode = 0;
15769 unsigned NumOperands = 0;
15771 // Truncate operations may prevent the merge of the SETCC instruction
15772 // and the arithmetic instruction before it. Attempt to truncate the operands
15773 // of the arithmetic instruction and use a reduced bit-width instruction.
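  // e.g. (i32 (trunc (and X_i64, Y_i64))) tested against zero can truncate
  // both operands and use a 32-bit AND instead, letting TEST fuse with the
  // narrower arithmetic (a sketch of the transform performed further below).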
15774 bool NeedTruncation = false;
15775 SDValue ArithOp = Op;
15776 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
15777 SDValue Arith = Op->getOperand(0);
15778 // Both the trunc and the arithmetic op need to have one user each.
15779 if (Arith->hasOneUse())
15780 switch (Arith.getOpcode()) {
15787 NeedTruncation = true;
15793   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
15794   // which may be the result of a CAST. We use the variable 'Op', the
15795   // non-cast value, when we check for possible users.
15796 switch (ArithOp.getOpcode()) {
15798 // Due to an isel shortcoming, be conservative if this add is likely to be
15799 // selected as part of a load-modify-store instruction. When the root node
15800 // in a match is a store, isel doesn't know how to remap non-chain non-flag
15801 // uses of other nodes in the match, such as the ADD in this case. This
15802 // leads to the ADD being left around and reselected, with the result being
15803     // two adds in the output. Alas, even if none of our users are stores, that
15804 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
15805 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
15806     // climbing the DAG back to the root, and it doesn't seem to be worth the
15807     // cost.
15808 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15809 UE = Op.getNode()->use_end(); UI != UE; ++UI)
15810 if (UI->getOpcode() != ISD::CopyToReg &&
15811 UI->getOpcode() != ISD::SETCC &&
15812 UI->getOpcode() != ISD::STORE)
15815 if (ConstantSDNode *C =
15816 dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
15817 // An add of one will be selected as an INC.
15818 if (C->isOne() && !Subtarget.slowIncDec()) {
15819 Opcode = X86ISD::INC;
15824 // An add of negative one (subtract of one) will be selected as a DEC.
15825 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
15826 Opcode = X86ISD::DEC;
15832 // Otherwise use a regular EFLAGS-setting add.
15833 Opcode = X86ISD::ADD;
15838 // If we have a constant logical shift that's only used in a comparison
15839     // against zero, turn it into an equivalent AND. This allows turning it into
15840 // a TEST instruction later.
15841 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
15842 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
15843 EVT VT = Op.getValueType();
15844 unsigned BitWidth = VT.getSizeInBits();
15845 unsigned ShAmt = Op->getConstantOperandVal(1);
15846 if (ShAmt >= BitWidth) // Avoid undefined shifts.
15848 APInt Mask = ArithOp.getOpcode() == ISD::SRL
15849 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
15850 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
15851 if (!Mask.isSignedIntN(32)) // Avoid large immediates.
15853 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
15854 DAG.getConstant(Mask, dl, VT));
15859 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
15860 // because a TEST instruction will be better.
15861 if (!hasNonFlagsUse(Op)) {
15862 SDValue Op0 = ArithOp->getOperand(0);
15863 SDValue Op1 = ArithOp->getOperand(1);
15864 EVT VT = ArithOp.getValueType();
15865 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
15866 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
15868 // But if we can combine this into an ANDN operation, then create an AND
15869 // now and allow it to be pattern matched into an ANDN.
15870 if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
15877 // Due to the ISEL shortcoming noted above, be conservative if this op is
15878 // likely to be selected as part of a load-modify-store instruction.
15879 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15880 UE = Op.getNode()->use_end(); UI != UE; ++UI)
15881 if (UI->getOpcode() == ISD::STORE)
15884 // Otherwise use a regular EFLAGS-setting instruction.
15885 switch (ArithOp.getOpcode()) {
15886 default: llvm_unreachable("unexpected operator!");
15887 case ISD::SUB: Opcode = X86ISD::SUB; break;
15888 case ISD::XOR: Opcode = X86ISD::XOR; break;
15889 case ISD::AND: Opcode = X86ISD::AND; break;
15891 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
15892 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
15895 Opcode = X86ISD::OR;
15909 return SDValue(Op.getNode(), 1);
15915   // If we found that truncation is beneficial, perform the truncation and
15916   // update 'Op'.
15917 if (NeedTruncation) {
15918 EVT VT = Op.getValueType();
15919 SDValue WideVal = Op->getOperand(0);
15920 EVT WideVT = WideVal.getValueType();
15921 unsigned ConvertedOp = 0;
15922 // Use a target machine opcode to prevent further DAGCombine
15923 // optimizations that may separate the arithmetic operations
15924 // from the setcc node.
15925 switch (WideVal.getOpcode()) {
15927 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
15928 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
15929 case ISD::AND: ConvertedOp = X86ISD::AND; break;
15930 case ISD::OR: ConvertedOp = X86ISD::OR; break;
15931 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
15935 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15936 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
15937 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15938 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15939 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15945 // Emit KTEST for bit vectors
15946 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
15949 // Emit a CMP with 0, which is the TEST pattern.
15950 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15951 DAG.getConstant(0, dl, Op.getValueType()));
15953 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15954 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
15956 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15957 DAG.ReplaceAllUsesWith(Op, New);
15958 return SDValue(New.getNode(), 1);
15961 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
15962 /// equivalent.
15963 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15964 const SDLoc &dl, SelectionDAG &DAG) const {
15965 if (isNullConstant(Op1))
15966 return EmitTest(Op0, X86CC, dl, DAG);
15968 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
15969 "Unexpected comparison operation for MVT::i1 operands");
15971 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15972 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15973     // Only promote the compare up to i32 if it is a 16-bit operation
15974     // with an immediate; 16-bit immediates are to be avoided.
15975 if ((Op0.getValueType() == MVT::i16 &&
15976 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
15977 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
15978 !Subtarget.isAtom()) {
15979 unsigned ExtendOp =
15980 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15981 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15982 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15984 // Use SUB instead of CMP to enable CSE between SUB and CMP.
15985 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15986 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15988 return SDValue(Sub.getNode(), 1);
15990 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15993 /// Convert a comparison if required by the subtarget.
15994 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15995 SelectionDAG &DAG) const {
15996 // If the subtarget does not support the FUCOMI instruction, floating-point
15997 // comparisons have to be converted.
15998 if (Subtarget.hasCMov() ||
15999 Cmp.getOpcode() != X86ISD::CMP ||
16000 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16001 !Cmp.getOperand(1).getValueType().isFloatingPoint())
16004 // The instruction selector will select an FUCOM instruction instead of
16005 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16006 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16007 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
16009 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16010 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16011 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16012 DAG.getConstant(8, dl, MVT::i8));
16013 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16015 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16016 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16017 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16020 /// Check if replacement of SQRT with RSQRT should be disabled.
16021 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16022 EVT VT = Op.getValueType();
16024 // We never want to use both SQRT and RSQRT instructions for the same input.
16025 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16029 return Subtarget.hasFastVectorFSQRT();
16030 return Subtarget.hasFastScalarFSQRT();
16033 /// The minimum architected relative accuracy is 2^-12. We need one
16034 /// Newton-Raphson step to have a good float result (24 bits of precision).
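/// One refinement step computes est' = est * (1.5 - 0.5 * x * est * est),
/// roughly doubling the number of accurate bits (~12 -> ~24).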
16035 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16036 SelectionDAG &DAG, int Enabled,
16037 int &RefinementSteps,
16038 bool &UseOneConstNR,
16039 bool Reciprocal) const {
16040 EVT VT = Op.getValueType();
16042 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16043 // TODO: Add support for AVX512 (v16f32).
16044 // It is likely not profitable to do this for f64 because a double-precision
16045 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16046 // instructions: convert to single, rsqrtss, convert back to double, refine
16047 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16048 // along with FMA, this could be a throughput win.
16049 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16050 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16051 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16052 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16053 RefinementSteps = 1;
16055 UseOneConstNR = false;
16056 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16061 /// The minimum architected relative accuracy is 2^-12. We need one
16062 /// Newton-Raphson step to have a good float result (24 bits of precision).
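/// One refinement step computes est' = est * (2.0 - x * est), roughly
/// doubling the number of accurate bits (~12 -> ~24).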
16063 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16065 int &RefinementSteps) const {
16066 EVT VT = Op.getValueType();
16068 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16069 // TODO: Add support for AVX512 (v16f32).
16070 // It is likely not profitable to do this for f64 because a double-precision
16071 // reciprocal estimate with refinement on x86 prior to FMA requires
16072 // 15 instructions: convert to single, rcpss, convert back to double, refine
16073 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16074 // along with FMA, this could be a throughput win.
16076 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16077 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16078 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16079 // Enable estimate codegen with 1 refinement step for vector division.
16080 // Scalar division estimates are disabled because they break too much
16081 // real-world code. These defaults are intended to match GCC behavior.
16082 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
16085 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16086 RefinementSteps = 1;
16088 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16093 /// If we have at least two divisions that use the same divisor, convert to
16094 /// multiplication by a reciprocal. This may need to be adjusted for a given
16095 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16096 /// This is because we still need one division to calculate the reciprocal and
16097 /// then we need two multiplies by that reciprocal as replacements for the
16098 /// original divisions.
16099 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16103 /// Helper for creating an X86ISD::SETCC node.
16104 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16105 SelectionDAG &DAG) {
16106 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16107 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16110 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16111 /// according to equal/not-equal condition code \p CC.
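/// e.g. (seteq (and X, (shl 1, N)), 0) becomes (X86bt X, N) read through
/// the carry flag: COND_AE (CF==0) for SETEQ, COND_B (CF==1) for SETNE.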
16112 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16113 const SDLoc &dl, SelectionDAG &DAG) {
16114 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
16115 // instruction. Since the shift amount is in-range-or-undefined, we know
16116 // that doing a bittest on the i32 value is ok. We extend to i32 because
16117 // the encoding for the i16 version is larger than the i32 version.
16118   // Also promote i16 to i32 for performance / code size reasons.
16119 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16120 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16122 // See if we can use the 32-bit instruction instead of the 64-bit one for a
16123 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16124 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16125 // known to be zero.
16126 if (Src.getValueType() == MVT::i64 &&
16127 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16128 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16130   // If the operand types disagree, extend the shift amount to match. Since
16131   // BT ignores high bits (like shifts), we can use an any-extend.
16132 if (Src.getValueType() != BitNo.getValueType())
16133 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
16135 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16136 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16137   return getSETCC(Cond, BT, dl, DAG);
16140 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16141 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16142 const SDLoc &dl, SelectionDAG &DAG) {
16143 SDValue Op0 = And.getOperand(0);
16144 SDValue Op1 = And.getOperand(1);
16145 if (Op0.getOpcode() == ISD::TRUNCATE)
16146 Op0 = Op0.getOperand(0);
16147 if (Op1.getOpcode() == ISD::TRUNCATE)
16148 Op1 = Op1.getOperand(0);
16151 if (Op1.getOpcode() == ISD::SHL)
16152 std::swap(Op0, Op1);
16153 if (Op0.getOpcode() == ISD::SHL) {
16154 if (isOneConstant(Op0.getOperand(0))) {
16155       // If we looked past a truncate, check that it's only truncating away
16156       // known zeros.
16157 unsigned BitWidth = Op0.getValueSizeInBits();
16158 unsigned AndBitWidth = And.getValueSizeInBits();
16159 if (BitWidth > AndBitWidth) {
16161 DAG.computeKnownBits(Op0, Zeros, Ones);
16162 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
16166 RHS = Op0.getOperand(1);
16168 } else if (Op1.getOpcode() == ISD::Constant) {
16169 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16170 uint64_t AndRHSVal = AndRHS->getZExtValue();
16171 SDValue AndLHS = Op0;
16173 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16174 LHS = AndLHS.getOperand(0);
16175 RHS = AndLHS.getOperand(1);
16178 // Use BT if the immediate can't be encoded in a TEST instruction.
16179 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
16181 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
16186 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
16191 // Convert (truncate (srl X, N) to i1) to (bt X, N)
16192 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
16193 const SDLoc &dl, SelectionDAG &DAG) {
16195 assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
16196 "Expected TRUNCATE to i1 node");
16198 if (Op.getOperand(0).getOpcode() != ISD::SRL)
16201 SDValue ShiftRight = Op.getOperand(0);
16202 return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
16206 /// Result of 'and' or 'trunc to i1' is compared against zero.
16207 /// Change to a BT node if possible.
16208 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
16209 const SDLoc &dl, SelectionDAG &DAG) const {
16210 if (Op.getOpcode() == ISD::AND)
16211 return LowerAndToBT(Op, CC, dl, DAG);
16212 if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
16213 return LowerTruncateToBT(Op, CC, dl, DAG);
16217 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
16218 /// CMPs.
16219 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
16224   // SSE Condition code mapping:
16225   //  0 - EQ
16226   //  1 - LT
16227   //  2 - LE
16228   //  3 - UNORD
16229   //  4 - NEQ
16230   //  5 - NLT
16231   //  6 - NLE
16232   //  7 - ORD
16233 switch (SetCCOpcode) {
16234 default: llvm_unreachable("Unexpected SETCC condition");
16236 case ISD::SETEQ: SSECC = 0; break;
16238 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
16240 case ISD::SETOLT: SSECC = 1; break;
16242 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
16244 case ISD::SETOLE: SSECC = 2; break;
16245 case ISD::SETUO: SSECC = 3; break;
16247 case ISD::SETNE: SSECC = 4; break;
16248 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16249 case ISD::SETUGE: SSECC = 5; break;
16250 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16251 case ISD::SETUGT: SSECC = 6; break;
16252 case ISD::SETO: SSECC = 7; break;
16254 case ISD::SETONE: SSECC = 8; break;
16257 std::swap(Op0, Op1);
16262 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
16263 /// concatenate the result back.
16264 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
16265 MVT VT = Op.getSimpleValueType();
16267 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
16268 "Unsupported value type for operation");
16270 unsigned NumElems = VT.getVectorNumElements();
16272 SDValue CC = Op.getOperand(2);
16274 // Extract the LHS vectors
16275 SDValue LHS = Op.getOperand(0);
16276 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
16277 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
16279 // Extract the RHS vectors
16280 SDValue RHS = Op.getOperand(1);
16281 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
16282 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
16284 // Issue the operation on the smaller types and concatenate the result back
16285 MVT EltVT = VT.getVectorElementType();
16286 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16287 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16288 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
16289 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
16292 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16293 SDValue Op0 = Op.getOperand(0);
16294 SDValue Op1 = Op.getOperand(1);
16295 SDValue CC = Op.getOperand(2);
16296 MVT VT = Op.getSimpleValueType();
16299 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
16300 "Unexpected type for boolean compare operation");
16301 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16302 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
16303 DAG.getConstant(-1, dl, VT));
16304 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
16305 DAG.getConstant(-1, dl, VT));
16306 switch (SetCCOpcode) {
16307 default: llvm_unreachable("Unexpected SETCC condition");
16309 // (x == y) -> ~(x ^ y)
16310 return DAG.getNode(ISD::XOR, dl, VT,
16311 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
16312 DAG.getConstant(-1, dl, VT));
16314 // (x != y) -> (x ^ y)
16315 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
16318 // (x > y) -> (x & ~y)
16319 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
16322 // (x < y) -> (~x & y)
16323 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
16326 // (x <= y) -> (~x | y)
16327 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
16330     // (x >= y) -> (x | ~y)
16331 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
16335 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16337 SDValue Op0 = Op.getOperand(0);
16338 SDValue Op1 = Op.getOperand(1);
16339 SDValue CC = Op.getOperand(2);
16340 MVT VT = Op.getSimpleValueType();
16343 assert(VT.getVectorElementType() == MVT::i1 &&
16344 "Cannot set masked compare for this operation");
16346 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16348 bool Unsigned = false;
16351 switch (SetCCOpcode) {
16352 default: llvm_unreachable("Unexpected SETCC condition");
16353 case ISD::SETNE: SSECC = 4; break;
16354 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
16355 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
16356 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
16357 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
16358 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
16359   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; // NLT
16360 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
16361 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
16362 case ISD::SETLE: SSECC = 2; break;
16366 std::swap(Op0, Op1);
16368 return DAG.getNode(Opc, dl, VT, Op0, Op1);
16369   Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
16370 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16371 DAG.getConstant(SSECC, dl, MVT::i8));
16374 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
16375 /// operand \p Op1. If non-trivial (for example because it's not constant)
16376 /// return an empty value.
16377 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
16378 SelectionDAG &DAG) {
16379 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
16383 MVT VT = Op1.getSimpleValueType();
16384 MVT EVT = VT.getVectorElementType();
16385 unsigned n = VT.getVectorNumElements();
16386 SmallVector<SDValue, 8> ULTOp1;
16388 for (unsigned i = 0; i < n; ++i) {
16389 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
16390 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
16393 // Avoid underflow.
16394 APInt Val = Elt->getAPIntValue();
16398 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
16401 return DAG.getBuildVector(VT, dl, ULTOp1);
16404 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
16405 SelectionDAG &DAG) {
16406 SDValue Op0 = Op.getOperand(0);
16407 SDValue Op1 = Op.getOperand(1);
16408 SDValue CC = Op.getOperand(2);
16409 MVT VT = Op.getSimpleValueType();
16410 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16411 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
16416 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
16417 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
16421 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
16422 assert(VT.getVectorNumElements() <= 16);
16423 Opc = X86ISD::CMPM;
16425 Opc = X86ISD::CMPP;
16426 // The SSE/AVX packed FP comparison nodes are defined with a
16427 // floating-point vector result that matches the operand type. This allows
16428 // them to work with an SSE1 target (integer vector types are not legal).
16429 VT = Op0.getSimpleValueType();
16432 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
16433 // emit two comparisons and a logic op to tie them together.
16434     // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
16435     // available.
16437 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
16439 // LLVM predicate is SETUEQ or SETONE.
16441 unsigned CombineOpc;
16442 if (SetCCOpcode == ISD::SETUEQ) {
16445 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
16446 static_cast<unsigned>(ISD::OR);
16448 assert(SetCCOpcode == ISD::SETONE);
16451 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
16452 static_cast<unsigned>(ISD::AND);
16455 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
16456 DAG.getConstant(CC0, dl, MVT::i8));
16457 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
16458 DAG.getConstant(CC1, dl, MVT::i8));
16459 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
16461 // Handle all other FP comparisons here.
16462 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
16463 DAG.getConstant(SSECC, dl, MVT::i8));
16466 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
16467 // result type of SETCC. The bitcast is expected to be optimized away
16468 // during combining/isel.
16469 if (Opc == X86ISD::CMPP)
16470 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
16475 MVT VTOp0 = Op0.getSimpleValueType();
16476 assert(VTOp0 == Op1.getSimpleValueType() &&
16477 "Expected operands with same type!");
16478 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
16479 "Invalid number of packed elements for source and destination!");
16481 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
16482 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
16483 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
16484     // legalizer first checks whether the first operand of the setcc has
16485 // a legal type. If so, then it promotes the return type to that same type.
16486 // Otherwise, the return type is promoted to the 'next legal type' which,
16487 // for a vector of MVT::i1 is always a 128-bit integer vector type.
16489 // We reach this code only if the following two conditions are met:
16490 // 1. Both return type and operand type have been promoted to wider types
16491 // by the type legalizer.
16492 // 2. The original operand type has been promoted to a 256-bit vector.
16494 // Note that condition 2. only applies for AVX targets.
16495 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
16496 return DAG.getZExtOrTrunc(NewOp, dl, VT);
16499 // The non-AVX512 code below works under the assumption that source and
16500 // destination types are the same.
16501 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
16502 "Value types for source and destination must be the same!");
16504 // Break 256-bit integer vector compare into smaller ones.
16505 if (VT.is256BitVector() && !Subtarget.hasInt256())
16506 return Lower256IntVSETCC(Op, DAG);
16508 // Operands are boolean (vectors of i1)
16509 MVT OpVT = Op1.getSimpleValueType();
16510 if (OpVT.getVectorElementType() == MVT::i1)
16511 return LowerBoolVSETCC_AVX512(Op, DAG);
16513 // The result is boolean, but operands are int/float
16514 if (VT.getVectorElementType() == MVT::i1) {
16515     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
16516     // but there is no compare instruction for i8 and i16 elements in KNL.
16517     // In that case, use an SSE compare.
16518 bool UseAVX512Inst =
16519 (OpVT.is512BitVector() ||
16520 OpVT.getScalarSizeInBits() >= 32 ||
16521 (Subtarget.hasBWI() && Subtarget.hasVLX()));
16524 return LowerIntVSETCC_AVX512(Op, DAG);
16526 return DAG.getNode(ISD::TRUNCATE, dl, VT,
16527 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
16530 // Lower using XOP integer comparisons.
16531 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
16532 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
16533 // Translate compare code to XOP PCOM compare mode.
16534 unsigned CmpMode = 0;
16535 switch (SetCCOpcode) {
16536 default: llvm_unreachable("Unexpected SETCC condition");
16538 case ISD::SETLT: CmpMode = 0x00; break;
16540 case ISD::SETLE: CmpMode = 0x01; break;
16542 case ISD::SETGT: CmpMode = 0x02; break;
16544 case ISD::SETGE: CmpMode = 0x03; break;
16545 case ISD::SETEQ: CmpMode = 0x04; break;
16546 case ISD::SETNE: CmpMode = 0x05; break;
16549 // Are we comparing unsigned or signed integers?
16550 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
16551 ? X86ISD::VPCOMU : X86ISD::VPCOM;
16553 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16554 DAG.getConstant(CmpMode, dl, MVT::i8));
16557 // We are handling one of the integer comparisons here. Since SSE only has
16558   // GT and EQ comparisons for integers, swapping operands and multiple
16559 // operations may be required for some comparisons.
16561 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
16562 bool Subus = false;
16564 switch (SetCCOpcode) {
16565 default: llvm_unreachable("Unexpected SETCC condition");
16566 case ISD::SETNE: Invert = true;
16567 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
16568 case ISD::SETLT: Swap = true;
16569 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
16570 case ISD::SETGE: Swap = true;
16571 case ISD::SETLE: Opc = X86ISD::PCMPGT;
16572 Invert = true; break;
16573 case ISD::SETULT: Swap = true;
16574 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
16575 FlipSigns = true; break;
16576 case ISD::SETUGE: Swap = true;
16577 case ISD::SETULE: Opc = X86ISD::PCMPGT;
16578 FlipSigns = true; Invert = true; break;
16581 // Special case: Use min/max operations for SETULE/SETUGE
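  // x u<= y iff umin(x, y) == x, and x u>= y iff umax(x, y) == x, so the
  // compare becomes a UMIN/UMAX followed by a PCMPEQ against Op0.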
16582 MVT VET = VT.getVectorElementType();
16584 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
16585 || (Subtarget.hasSSE2() && (VET == MVT::i8));
16588 switch (SetCCOpcode) {
16590 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
16591 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
16594 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
16597 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
16598 if (!MinMax && hasSubus) {
16599     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
16600     //    Op0 u<= Op1:
16601 // t = psubus Op0, Op1
16602 // pcmpeq t, <0..0>
16603 switch (SetCCOpcode) {
16605 case ISD::SETULT: {
16606 // If the comparison is against a constant we can turn this into a
16607 // setule. With psubus, setule does not require a swap. This is
16608 // beneficial because the constant in the register is no longer
16609       // clobbered as the destination, so it can be hoisted out of a loop.
16610 // Only do this pre-AVX since vpcmp* is no longer destructive.
16611 if (Subtarget.hasAVX())
16613 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
16615 Subus = true; Invert = false; Swap = false;
16619 // Psubus is better than flip-sign because it requires no inversion.
16620 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
16621 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
16625 Opc = X86ISD::SUBUS;
16631 std::swap(Op0, Op1);
16633 // Check that the operation in question is available (most are plain SSE2,
16634 // but PCMPGTQ and PCMPEQQ have different requirements).
16635 if (VT == MVT::v2i64) {
16636 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
16637 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
16639 // First cast everything to the right type.
16640 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
16641 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
16643 // Since SSE has no unsigned integer comparisons, we need to flip the sign
16644 // bits of the inputs before performing those operations. The lower
16645 // compare is always unsigned.
16648 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
16650 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
16651 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
16652 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
16654 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
16655 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
16657 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
16658 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
16659 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
16661       // Create masks for only the low parts/high parts of the 64-bit integers.
16662 static const int MaskHi[] = { 1, 1, 3, 3 };
16663 static const int MaskLo[] = { 0, 0, 2, 2 };
16664 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
16665 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
16666 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
16668 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
16669 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
16672 Result = DAG.getNOT(dl, Result, MVT::v4i32);
16674 return DAG.getBitcast(VT, Result);
16677 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
16678       // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
16679 // pcmpeqd + pshufd + pand.
16680 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
16682 // First cast everything to the right type.
16683 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
16684 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
16687 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
16689 // Make sure the lower and upper halves are both all-ones.
16690 static const int Mask[] = { 1, 0, 3, 2 };
16691 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
16692 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
16695 Result = DAG.getNOT(dl, Result, MVT::v4i32);
16697 return DAG.getBitcast(VT, Result);
16701 // Since SSE has no unsigned integer comparisons, we need to flip the sign
16702 // bits of the inputs before performing those operations.
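  // i.e. x u> y iff (x ^ SignBit) s> (y ^ SignBit), so XOR both inputs with
  // the splatted sign-bit constant and use the signed PCMPGT.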
16704 MVT EltVT = VT.getVectorElementType();
16705 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
16707 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
16708 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
16711 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
16713 // If the logical-not of the result is required, perform that now.
16715 Result = DAG.getNOT(dl, Result, VT);
16718 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
16721 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
16722 getZeroVector(VT, Subtarget, DAG, dl));
16727 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
16729 MVT VT = Op.getSimpleValueType();
16731 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
16733 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
16734 && "SetCC type must be 8-bit or 1-bit integer");
16735 SDValue Op0 = Op.getOperand(0);
16736 SDValue Op1 = Op.getOperand(1);
16738 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16740 // Optimize to BT if possible.
16741 // Lower (X & (1 << N)) == 0 to BT(X, N).
16742 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
16743 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
16744 // Lower (trunc (X >> N) to i1) to BT(X, N).
16745 if (Op0.hasOneUse() && isNullConstant(Op1) &&
16746 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16747 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
16749 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
16754   // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
16755   // these.
16756 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
16757 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16759 // If the input is a setcc, then reuse the input setcc or use a new one with
16760 // the inverted condition.
16761 if (Op0.getOpcode() == X86ISD::SETCC) {
16762 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
16763 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
16767 CCode = X86::GetOppositeBranchCondition(CCode);
16768 SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
16770 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
16774 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16775 if (isOneConstant(Op1)) {
16776 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
16777 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
16779 if (!isNullConstant(Op1)) {
16780 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
16781 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
16785 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
16786 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
16787 if (X86CC == X86::COND_INVALID)
16790 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
16791 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
16792 SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
16794 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
16798 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
16799 SDValue LHS = Op.getOperand(0);
16800 SDValue RHS = Op.getOperand(1);
16801 SDValue Carry = Op.getOperand(2);
16802 SDValue Cond = Op.getOperand(3);
16805 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
16806 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
16808 assert(Carry.getOpcode() != ISD::CARRY_FALSE);
16809 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16810 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
16811 SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
16812 if (Op.getSimpleValueType() == MVT::i1)
16813 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
16817 /// Return true if opcode is a X86 logical comparison.
16818 static bool isX86LogicalCmp(SDValue Op) {
16819 unsigned Opc = Op.getOpcode();
16820 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
16821 Opc == X86ISD::SAHF)
16823 if (Op.getResNo() == 1 &&
16824 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
16825 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
16826 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
16827 Opc == X86ISD::XOR || Opc == X86ISD::AND))
16830 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
16836 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
16837 if (V.getOpcode() != ISD::TRUNCATE)
16840 SDValue VOp0 = V.getOperand(0);
16841 unsigned InBits = VOp0.getValueSizeInBits();
16842 unsigned Bits = V.getValueSizeInBits();
16843 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
16846 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
16847 bool AddTest = true;
16848 SDValue Cond = Op.getOperand(0);
16849 SDValue Op1 = Op.getOperand(1);
16850 SDValue Op2 = Op.getOperand(2);
16852 MVT VT = Op1.getSimpleValueType();
16855   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
16856   // are available, or into a VBLENDV if AVX is available.
16857   // Otherwise, FP cmovs get lowered into a less efficient branch sequence later.
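  // i.e. (select (setcc a, b, cc), x, y) lowers to
  //   (or (and (fsetcc a, b, cc), x), (andn (fsetcc a, b, cc), y))
  // when only the SSE logic ops are available (the FAND/FANDN/FOR sequence
  // below).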
16858 if (Cond.getOpcode() == ISD::SETCC &&
16859 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
16860 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
16861 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
16862 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
16863 int SSECC = translateX86FSETCC(
16864 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
16867 if (Subtarget.hasAVX512()) {
16868 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
16869 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
16870 return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
16871 DL, VT, Cmp, Op1, Op2);
16872 }
16874 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
16875 DAG.getConstant(SSECC, DL, MVT::i8));
16877 // If we have AVX, we can use a variable vector select (VBLENDV) instead
16878 // of 3 logic instructions for size savings and potentially speed.
16879 // Unfortunately, there is no scalar form of VBLENDV.
16881 // If either operand is a constant, don't try this. We can expect to
16882 // optimize away at least one of the logic instructions later in that
16883 // case, so that sequence would be faster than a variable blend.
16885 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
16886 // uses XMM0 as the selection register. That may need just as many
16887 // instructions as the AND/ANDN/OR sequence due to register moves, so
16888 // don't bother.
16890 if (Subtarget.hasAVX() &&
16891 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
16893 // Convert to vectors, do a VSELECT, and convert back to scalar.
16894 // All of the conversions should be optimized away.
16896 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
16897 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
16898 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
16899 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
16901 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
16902 VCmp = DAG.getBitcast(VCmpVT, VCmp);
16904 SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
16906 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
16907 VSel, DAG.getIntPtrConstant(0, DL));
16908 }
16909 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
16910 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
16911 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
16912 }
16915 // AVX512 fallback is to lower selects of scalar floats to masked moves.
16916 if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
16917 Subtarget.hasAVX512())
16918 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
16920 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
16921 SDValue Op1Scalar;
16922 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
16923 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
16924 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
16925 Op1Scalar = Op1.getOperand(0);
16926 SDValue Op2Scalar;
16927 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
16928 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
16929 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
16930 Op2Scalar = Op2.getOperand(0);
16931 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
16932 SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
16933 Op1Scalar.getValueType(),
16934 Cond, Op1Scalar, Op2Scalar);
16935 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
16936 return DAG.getBitcast(VT, newSelect);
16937 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
16938 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
16939 DAG.getIntPtrConstant(0, DL));
16940 }
16943 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
16944 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
16945 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
16946 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
16947 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
16948 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
16949 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
16950 Cond, Op1, Op2);
16951 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
16952 }
16953 }
16954 if (Cond.getOpcode() == ISD::SETCC) {
16955 if (SDValue NewCond = LowerSETCC(Cond, DAG))
16956 Cond = NewCond;
16957 }
16959 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
16960 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
16961 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
16962 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
16963 if (Cond.getOpcode() == X86ISD::SETCC &&
16964 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
16965 isNullConstant(Cond.getOperand(1).getOperand(1))) {
16966 SDValue Cmp = Cond.getOperand(1);
16968 unsigned CondCode = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
16970 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
16971 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
16972 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
16974 SDValue CmpOp0 = Cmp.getOperand(0);
16975 // Apply further optimizations for special cases
16976 // (select (x != 0), -1, 0) -> neg & sbb
16977 // (select (x == 0), 0, -1) -> neg & sbb
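// Concrete sketch for (select (x != 0), -1, 0) on a 32-bit GPR
// (registers illustrative):
//   negl %eax            ; CF = (x != 0)
//   sbbl %eax, %eax      ; %eax = 0 - 0 - CF = CF ? -1 : 0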
16978 if (isNullConstant(Y) &&
16979 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
16980 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
16981 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
16982 DAG.getConstant(0, DL,
16983 CmpOp0.getValueType()),
16984 CmpOp0);
16985 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16986 DAG.getConstant(X86::COND_B, DL, MVT::i8),
16987 SDValue(Neg.getNode(), 1));
16988 return Res;
16989 }
16991 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
16992 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
16993 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16995 SDValue Res = // Res = 0 or -1.
16996 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16997 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
16999 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17000 Res = DAG.getNOT(DL, Res, Res.getValueType());
17002 if (!isNullConstant(Op2))
17003 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17005 return Res;
17006 }
17007 }
17008 // Look past (and (setcc_carry (cmp ...)), 1).
17009 if (Cond.getOpcode() == ISD::AND &&
17010 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17011 isOneConstant(Cond.getOperand(1)))
17012 Cond = Cond.getOperand(0);
17014 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17015 // setting operand in place of the X86ISD::SETCC.
17016 unsigned CondOpcode = Cond.getOpcode();
17017 if (CondOpcode == X86ISD::SETCC ||
17018 CondOpcode == X86ISD::SETCC_CARRY) {
17019 CC = Cond.getOperand(0);
17021 SDValue Cmp = Cond.getOperand(1);
17022 unsigned Opc = Cmp.getOpcode();
17023 MVT VT = Op.getSimpleValueType();
17025 bool IllegalFPCMov = false;
17026 if (VT.isFloatingPoint() && !VT.isVector() &&
17027 !isScalarFPTypeInSSEReg(VT)) // FPStack?
17028 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17030 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17031 Opc == X86ISD::BT) { // FIXME
17032 Cond = Cmp;
17033 AddTest = false;
17034 }
17035 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17036 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17037 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17038 Cond.getOperand(0).getValueType() != MVT::i8)) {
17039 SDValue LHS = Cond.getOperand(0);
17040 SDValue RHS = Cond.getOperand(1);
17041 unsigned X86Opcode;
17042 unsigned X86Cond;
17043 SDVTList VTs;
17044 switch (CondOpcode) {
17045 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17046 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17047 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17048 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17049 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17050 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17051 default: llvm_unreachable("unexpected overflowing operator");
17052 }
17053 if (CondOpcode == ISD::UMULO)
17054 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17055 MVT::i32);
17056 else
17057 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17059 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17061 if (CondOpcode == ISD::UMULO)
17062 Cond = X86Op.getValue(2);
17063 else
17064 Cond = X86Op.getValue(1);
17066 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17067 AddTest = false;
17068 }
17071 // Look past the truncate if the high bits are known zero.
17072 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17073 Cond = Cond.getOperand(0);
17075 // We know the result of AND is compared against zero. Try to match
17076 // it to BT.
17077 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17078 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17079 CC = NewSetCC.getOperand(0);
17080 Cond = NewSetCC.getOperand(1);
17081 AddTest = false;
17082 }
17083 }
17085 if (AddTest) {
17087 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17088 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17089 }
17091 // a < b ? -1 : 0 -> RES = ~setcc_carry
17092 // a < b ? 0 : -1 -> RES = setcc_carry
17093 // a >= b ? -1 : 0 -> RES = setcc_carry
17094 // a >= b ? 0 : -1 -> RES = ~setcc_carry
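// E.g. for unsigned 'a < b ? -1 : 0' the carry flag from the SUB feeds
// SETCC_CARRY, which is conceptually 'sbb reg, reg' (CF ? -1 : 0); the
// inverted variants just wrap the result in a NOT. Schematic only.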
17095 if (Cond.getOpcode() == X86ISD::SUB) {
17096 Cond = ConvertCmpIfNecessary(Cond, DAG);
17097 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17099 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17100 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17101 (isNullConstant(Op1) || isNullConstant(Op2))) {
17102 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17103 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17104 Cond);
17105 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17106 return DAG.getNOT(DL, Res, Res.getValueType());
17107 return Res;
17108 }
17109 }
17111 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
17112 // widen the cmov and push the truncate through. This avoids introducing a new
17113 // branch during isel and doesn't add any extensions.
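// Schematic form of the rewrite (types illustrative):
//   (i8 (select C, (trunc i32 A), (trunc i32 B)))
//     -> (i8 (trunc (i32 (X86ISD::CMOV C, A, B))))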
17114 if (Op.getValueType() == MVT::i8 &&
17115 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17116 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17117 if (T1.getValueType() == T2.getValueType() &&
17118 // Blacklist CopyFromReg to avoid partial register stalls.
17119 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode() != ISD::CopyFromReg) {
17120 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17121 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17122 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17123 }
17124 }
17126 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17127 // condition is true.
17128 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17129 SDValue Ops[] = { Op2, Op1, CC, Cond };
17130 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17131 }
17133 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17134 const X86Subtarget &Subtarget,
17135 SelectionDAG &DAG) {
17136 MVT VT = Op->getSimpleValueType(0);
17137 SDValue In = Op->getOperand(0);
17138 MVT InVT = In.getSimpleValueType();
17139 MVT VTElt = VT.getVectorElementType();
17140 MVT InVTElt = InVT.getVectorElementType();
17141 SDLoc dl(Op);
17144 if ((InVTElt == MVT::i1) &&
17145 (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
17146 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
17148 ((Subtarget.hasBWI() && VT.is512BitVector() &&
17149 VTElt.getSizeInBits() <= 16)) ||
17151 ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
17152 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
17154 ((Subtarget.hasDQI() && VT.is512BitVector() &&
17155 VTElt.getSizeInBits() >= 32))))
17156 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17158 unsigned NumElts = VT.getVectorNumElements();
17160 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
17161 return SDValue();
17163 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
17164 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
17165 return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
17166 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17167 }
17169 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
17170 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
17171 SDValue NegOne = DAG.getConstant(
17172 APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
17173 SDValue Zero = DAG.getConstant(
17174 APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
17176 SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
17177 if (VT.is512BitVector())
17178 return V;
17179 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
17180 }
17182 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
17183 // For sign extend this needs to handle all vector sizes and SSE4.1 and
17184 // non-SSE4.1 targets. For zero extend this should only handle inputs of
17185 // MVT::v64i8 when BWI is not supported, but AVX512 is.
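// E.g. (assuming SSE4.1 is available) a v16i8 -> v4i32
// SIGN_EXTEND_VECTOR_INREG of the low four bytes maps onto a single
// 'pmovsxbd'; without SSE4.1 it is expanded via the unpack + arithmetic
// shift sequence implemented below.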
17186 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
17187 const X86Subtarget &Subtarget,
17188 SelectionDAG &DAG) {
17189 SDValue In = Op->getOperand(0);
17190 MVT VT = Op->getSimpleValueType(0);
17191 MVT InVT = In.getSimpleValueType();
17192 assert(VT.getSizeInBits() == InVT.getSizeInBits());
17194 MVT SVT = VT.getVectorElementType();
17195 MVT InSVT = InVT.getVectorElementType();
17196 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
17198 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
17199 return SDValue();
17200 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
17201 return SDValue();
17202 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
17203 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
17204 !(VT.is512BitVector() && Subtarget.hasAVX512()))
17205 return SDValue();
17207 SDLoc dl(Op);
17209 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17210 // For 512-bit vectors, we need 128-bits or 256-bits.
17211 if (VT.getSizeInBits() > 128) {
17212 // Input needs to be at least the same number of elements as output, and
17213 // at least 128-bits.
17214 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
17215 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
17216 InVT = In.getSimpleValueType();
17217 }
17218 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
17219 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
17221 // SSE41 targets can use the pmovsx* instructions directly.
17222 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
17223 X86ISD::VSEXT : X86ISD::VZEXT;
17224 if (Subtarget.hasSSE41())
17225 return DAG.getNode(ExtOpc, dl, VT, In);
17227 // We should only get here for sign extend.
17228 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
17229 "Unexpected opcode!");
17231 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
17232 SDValue Curr = In;
17233 MVT CurrVT = InVT;
17235 // As SRAI is only available on i16/i32 types, we expand only up to i32
17236 // and handle i64 separately.
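// Rough shape of the expansion for v16i8 -> v4i32 (schematic only):
//   unpcklbw  ; bytes move to the high half of each i16 lane
//   unpcklwd  ; i16s move to the high half of each i32 lane
//   psrad 24  ; arithmetic shift replicates the sign bit downward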
17237 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
17238 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
17239 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
17240 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
17241 Curr = DAG.getBitcast(CurrVT, Curr);
17242 }
17244 SDValue SignExt = Curr;
17245 if (CurrVT != InVT) {
17246 unsigned SignExtShift =
17247 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
17248 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17249 DAG.getConstant(SignExtShift, dl, MVT::i8));
17250 }
17255 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
17256 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17257 DAG.getConstant(31, dl, MVT::i8));
17258 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
17259 return DAG.getBitcast(VT, Ext);
17260 }
17262 return DAG.getBitcast(VT, SignExt);
17263 }
17265 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17266 SelectionDAG &DAG) {
17267 MVT VT = Op->getSimpleValueType(0);
17268 SDValue In = Op->getOperand(0);
17269 MVT InVT = In.getSimpleValueType();
17270 SDLoc dl(Op);
17272 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
17273 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
17275 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
17276 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
17277 (VT != MVT::v16i16 || InVT != MVT::v16i8))
17278 return SDValue();
17280 if (Subtarget.hasInt256())
17281 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17283 // Optimize vectors in AVX mode
17284 // Sign extend v8i16 to v8i32 and
17285 // v4i32 to v4i64.
17287 // Divide input vector into two parts
17288 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
17289 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
17290 // concat the vectors to original VT
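// Schematic for v8i16 -> v8i32 (mask values illustrative):
//   Lo  = shuffle In, undef, <0,1,2,3,-1,-1,-1,-1>
//   Hi  = shuffle In, undef, <4,5,6,7,-1,-1,-1,-1>
//   Res = concat_vectors (vpmovsxwd Lo), (vpmovsxwd Hi)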
17292 unsigned NumElems = InVT.getVectorNumElements();
17293 SDValue Undef = DAG.getUNDEF(InVT);
17295 SmallVector<int,8> ShufMask1(NumElems, -1);
17296 for (unsigned i = 0; i != NumElems/2; ++i)
17297 ShufMask1[i] = i;
17299 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
17301 SmallVector<int,8> ShufMask2(NumElems, -1);
17302 for (unsigned i = 0; i != NumElems/2; ++i)
17303 ShufMask2[i] = i + NumElems/2;
17305 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
17307 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
17308 VT.getVectorNumElements() / 2);
17310 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
17311 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
17313 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
17314 }
17316 // Lower a truncating store. We need special lowering for vXi1 vectors.
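// E.g. with AVX512VL+BWI+DQI a 'truncstore <8 x i32> to <8 x i1>' can be a
// single mask-register store; the fallbacks below first widen small masks
// to v8i1 or split v32i8 in half. Which path is taken depends entirely on
// the available subtarget features.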
17317 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
17318 SelectionDAG &DAG) {
17319 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
17320 SDLoc dl(StOp);
17321 EVT MemVT = St->getMemoryVT();
17322 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
17323 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
17324 "Expected truncstore of i1 vector");
17326 SDValue Op = St->getValue();
17327 MVT OpVT = Op.getValueType().getSimpleVT();
17328 unsigned NumElts = OpVT.getVectorNumElements();
17329 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17331 // Truncate and store - everything is legal
17332 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
17333 if (MemVT.getSizeInBits() < 8)
17334 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
17335 DAG.getUNDEF(MVT::v8i1), Op,
17336 DAG.getIntPtrConstant(0, dl));
17337 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17338 St->getMemOperand());
17341 // A narrower subset; assume we only have AVX-512F.
17342 if (NumElts <= 8) {
17344 // Extend to an 8-element vector.
17345 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
17346 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
17347 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
17349 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
17350 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17351 St->getMemOperand());
17352 }
17354 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
17355 // Divide the vector into 2 parts and store each part separately
17356 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17357 DAG.getIntPtrConstant(0, dl));
17358 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
17359 SDValue BasePtr = St->getBasePtr();
17360 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
17361 St->getMemOperand());
17362 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17363 DAG.getIntPtrConstant(16, dl));
17364 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
17366 SDValue BasePtrHi =
17367 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
17368 DAG.getConstant(2, dl, BasePtr.getValueType()));
17370 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
17371 BasePtrHi, St->getMemOperand());
17372 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
17373 }
17375 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
17376 const X86Subtarget &Subtarget,
17377 SelectionDAG &DAG) {
17378 SDLoc dl(Op);
17379 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
17381 EVT MemVT = Ld->getMemoryVT();
17382 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
17383 "Expected i1 vector load");
17384 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
17385 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17386 MVT VT = Op.getValueType().getSimpleVT();
17387 unsigned NumElts = VT.getVectorNumElements();
17389 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17391 // Load and extend - everything is legal.
17392 if (VT.getVectorNumElements() <= 8) {
17393 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
17394 Ld->getBasePtr(),
17395 Ld->getMemOperand());
17396 // Replace chain users with the new chain.
17397 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17398 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17399 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
17400 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
17402 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
17403 DAG.getIntPtrConstant(0, dl));
17404 }
17405 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
17406 Ld->getBasePtr(),
17407 Ld->getMemOperand());
17408 // Replace chain users with the new chain.
17409 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17410 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17412 // Finally, do a normal sign-extend to the desired register.
17413 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
17414 }
17416 if (NumElts <= 8) {
17417 // A narrower subset; assume we only have AVX-512F.
17418 unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
17419 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
17420 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
17421 Ld->getBasePtr(),
17422 Ld->getMemOperand());
17423 // Replace chain users with the new chain.
17424 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17425 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17427 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
17428 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
17430 if (NumElts == 8)
17431 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
17433 // We still need to handle the narrower v4i1 and v2i1 cases here.
17435 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
17436 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
17437 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
17438 DAG.getIntPtrConstant(0, dl));
17439 }
17441 assert(VT == MVT::v32i8 && "Unexpected extload type");
17443 SmallVector<SDValue, 2> Chains;
17445 SDValue BasePtr = Ld->getBasePtr();
17446 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
17447 BasePtr,
17448 Ld->getMemOperand());
17449 Chains.push_back(LoadLo.getValue(1));
17451 SDValue BasePtrHi =
17452 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
17453 DAG.getConstant(2, dl, BasePtr.getValueType()));
17455 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
17456 BasePtrHi,
17457 Ld->getMemOperand());
17458 Chains.push_back(LoadHi.getValue(1));
17459 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
17460 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
17462 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
17463 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
17464 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
17465 }
17467 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
17468 // may emit an illegal shuffle but the expansion is still better than scalar
17469 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
17470 // we'll emit a shuffle and an arithmetic shift.
17471 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
17472 // TODO: It is possible to support ZExt by zeroing the undef values during
17473 // the shuffle phase or after the shuffle.
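// Schematic example for (sextload v4i8 -> v4i32): the four bytes are
// fetched with one i32 scalar load, reinterpreted as v4i8 (padded to a
// legal width), and sign-extended in-register -- a single pmovsxbd when
// SSE4.1 is available, otherwise the VECTOR_INREG expansion above.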
17474 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
17475 SelectionDAG &DAG) {
17476 MVT RegVT = Op.getSimpleValueType();
17477 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
17478 assert(RegVT.isInteger() &&
17479 "We only custom lower integer vector sext loads.");
17481 // Nothing useful we can do without SSE2 shuffles.
17482 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
17484 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
17485 SDLoc dl(Ld);
17486 EVT MemVT = Ld->getMemoryVT();
17487 if (MemVT.getScalarType() == MVT::i1)
17488 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
17490 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17491 unsigned RegSz = RegVT.getSizeInBits();
17493 ISD::LoadExtType Ext = Ld->getExtensionType();
17495 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
17496 && "Only anyext and sext are currently implemented.");
17497 assert(MemVT != RegVT && "Cannot extend to the same type");
17498 assert(MemVT.isVector() && "Must load a vector from memory");
17500 unsigned NumElems = RegVT.getVectorNumElements();
17501 unsigned MemSz = MemVT.getSizeInBits();
17502 assert(RegSz > MemSz && "Register size must be greater than the mem size");
17504 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
17505 // The only way in which we have a legal 256-bit vector result but not the
17506 // integer 256-bit operations needed to directly lower a sextload is if we
17507 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
17508 // a 128-bit vector and a normal sign_extend to 256-bits that should get
17509 // correctly legalized. We do this late to allow the canonical form of
17510 // sextload to persist throughout the rest of the DAG combiner -- it wants
17511 // to fold together any extensions it can, and so will fuse a sign_extend
17512 // of an sextload into a sextload targeting a wider value.
17513 SDValue Load;
17514 if (MemSz == 128) {
17515 // Just switch this to a normal load.
17516 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
17517 "it must be a legal 128-bit vector "
17519 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
17520 Ld->getPointerInfo(), Ld->getAlignment(),
17521 Ld->getMemOperand()->getFlags());
17522 } else {
17523 assert(MemSz < 128 &&
17524 "Can't extend a type wider than 128 bits to a 256 bit vector!");
17525 // Do an sext load to a 128-bit vector type. We want to use the same
17526 // number of elements, but elements half as wide. This will end up being
17527 // recursively lowered by this routine, but will succeed as we definitely
17528 // have all the necessary features if we're using AVX1.
17529 EVT HalfEltVT =
17530 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
17531 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
17532 Load =
17533 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
17534 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
17535 Ld->getMemOperand()->getFlags());
17536 }
17538 // Replace chain users with the new chain.
17539 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17540 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17542 // Finally, do a normal sign-extend to the desired register.
17543 return DAG.getSExtOrTrunc(Load, dl, RegVT);
17544 }
17546 // All sizes must be a power of two.
17547 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
17548 "Non-power-of-two elements are not custom lowered!");
17550 // Attempt to load the original value using scalar loads.
17551 // Find the largest scalar type that divides the total loaded size.
17552 MVT SclrLoadTy = MVT::i8;
17553 for (MVT Tp : MVT::integer_valuetypes()) {
17554 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
17555 SclrLoadTy = Tp;
17556 }
17557 }
17559 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
17560 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
17561 (64 <= MemSz))
17562 SclrLoadTy = MVT::f64;
17564 // Calculate the number of scalar loads that we need to perform
17565 // in order to load our vector from memory.
17566 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
17568 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
17569 "Can only lower sext loads with a single scalar load!");
17571 unsigned loadRegSize = RegSz;
17572 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
17573 loadRegSize = 128;
17575 // Represent our vector as a sequence of elements which are the
17576 // largest scalar that we can load.
17577 EVT LoadUnitVecVT = EVT::getVectorVT(
17578 *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
17580 // Represent the data using the same element type that is stored in
17581 // memory. In practice, we 'widen' MemVT.
17582 EVT WideVecVT =
17583 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
17584 loadRegSize / MemVT.getScalarSizeInBits());
17586 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
17587 "Invalid vector type");
17589 // We can't shuffle using an illegal type.
17590 assert(TLI.isTypeLegal(WideVecVT) &&
17591 "We only lower types that form legal widened vector types");
17593 SmallVector<SDValue, 8> Chains;
17594 SDValue Ptr = Ld->getBasePtr();
17595 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
17596 TLI.getPointerTy(DAG.getDataLayout()));
17597 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
17599 for (unsigned i = 0; i < NumLoads; ++i) {
17600 // Perform a single load.
17601 SDValue ScalarLoad =
17602 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
17603 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
17604 Chains.push_back(ScalarLoad.getValue(1));
17605 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
17606 // another round of DAGCombining.
17607 if (i == 0)
17608 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
17609 else
17610 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
17611 ScalarLoad, DAG.getIntPtrConstant(i, dl));
17613 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
17614 }
17616 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
17618 // Bitcast the loaded value to a vector of the original element type, in
17619 // the size of the target vector type.
17620 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
17621 unsigned SizeRatio = RegSz / MemSz;
17623 if (Ext == ISD::SEXTLOAD) {
17624 // If we have SSE4.1, we can directly emit a VSEXT node.
17625 if (Subtarget.hasSSE41()) {
17626 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
17627 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
17628 return Sext;
17629 }
17631 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
17632 // lane.
17633 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
17634 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
17636 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
17637 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
17638 return Shuff;
17639 }
17641 // Redistribute the loaded elements into the different locations.
17642 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
17643 for (unsigned i = 0; i != NumElems; ++i)
17644 ShuffleVec[i * SizeRatio] = i;
17646 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
17647 DAG.getUNDEF(WideVecVT), ShuffleVec);
17649 // Bitcast to the requested type.
17650 Shuff = DAG.getBitcast(RegVT, Shuff);
17651 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
17652 return Shuff;
17653 }
17655 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
17656 /// each of which has no other use apart from the AND / OR.
17657 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
17658 Opc = Op.getOpcode();
17659 if (Opc != ISD::OR && Opc != ISD::AND)
17660 return false;
17661 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
17662 Op.getOperand(0).hasOneUse() &&
17663 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
17664 Op.getOperand(1).hasOneUse());
17665 }
17667 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
17668 /// SETCC node has a single use.
17669 static bool isXor1OfSetCC(SDValue Op) {
17670 if (Op.getOpcode() != ISD::XOR)
17671 return false;
17672 if (isOneConstant(Op.getOperand(1)))
17673 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
17674 Op.getOperand(0).hasOneUse();
17675 return false;
17676 }
17678 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
17679 bool addTest = true;
17680 SDValue Chain = Op.getOperand(0);
17681 SDValue Cond = Op.getOperand(1);
17682 SDValue Dest = Op.getOperand(2);
17683 SDLoc dl(Op);
17684 SDValue CC;
17685 bool Inverted = false;
17687 if (Cond.getOpcode() == ISD::SETCC) {
17688 // Check for setcc([su]{add,sub,mul}o == 0).
17689 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
17690 isNullConstant(Cond.getOperand(1)) &&
17691 Cond.getOperand(0).getResNo() == 1 &&
17692 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
17693 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
17694 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
17695 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
17696 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
17697 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
17698 Inverted = true;
17699 Cond = Cond.getOperand(0);
17700 } else {
17701 if (SDValue NewCond = LowerSETCC(Cond, DAG))
17702 Cond = NewCond;
17703 }
17704 }
17705 #if 0
17706 // FIXME: LowerXALUO doesn't handle these!!
17707 else if (Cond.getOpcode() == X86ISD::ADD ||
17708 Cond.getOpcode() == X86ISD::SUB ||
17709 Cond.getOpcode() == X86ISD::SMUL ||
17710 Cond.getOpcode() == X86ISD::UMUL)
17711 Cond = LowerXALUO(Cond, DAG);
17712 #endif
17714 // Look past (and (setcc_carry (cmp ...)), 1).
17715 if (Cond.getOpcode() == ISD::AND &&
17716 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17717 isOneConstant(Cond.getOperand(1)))
17718 Cond = Cond.getOperand(0);
17720 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17721 // setting operand in place of the X86ISD::SETCC.
17722 unsigned CondOpcode = Cond.getOpcode();
17723 if (CondOpcode == X86ISD::SETCC ||
17724 CondOpcode == X86ISD::SETCC_CARRY) {
17725 CC = Cond.getOperand(0);
17727 SDValue Cmp = Cond.getOperand(1);
17728 unsigned Opc = Cmp.getOpcode();
17729 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
17730 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
17731 Cond = Cmp;
17732 addTest = false;
17733 } else {
17734 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
17735 default: break;
17736 case X86::COND_O:
17737 case X86::COND_B:
17738 // These can only come from an arithmetic instruction with overflow,
17739 // e.g. SADDO, UADDO.
17740 Cond = Cond.getOperand(1);
17741 addTest = false;
17742 break;
17743 }
17744 }
17745 }
17746 CondOpcode = Cond.getOpcode();
17747 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17748 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17749 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17750 Cond.getOperand(0).getValueType() != MVT::i8)) {
17751 SDValue LHS = Cond.getOperand(0);
17752 SDValue RHS = Cond.getOperand(1);
17753 unsigned X86Opcode;
17754 unsigned X86Cond;
17755 SDVTList VTs;
17756 // Keep this in sync with LowerXALUO, otherwise we might create redundant
17757 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
17758 // X86ISD::INC).
17759 switch (CondOpcode) {
17760 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17761 case ISD::SADDO:
17762 if (isOneConstant(RHS)) {
17763 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
17764 break;
17765 }
17766 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17767 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17768 case ISD::SSUBO:
17769 if (isOneConstant(RHS)) {
17770 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
17771 break;
17772 }
17773 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17774 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17775 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17776 default: llvm_unreachable("unexpected overflowing operator");
17777 }
17778 if (Inverted)
17779 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
17780 if (CondOpcode == ISD::UMULO)
17781 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17782 MVT::i32);
17783 else
17784 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17786 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
17788 if (CondOpcode == ISD::UMULO)
17789 Cond = X86Op.getValue(2);
17790 else
17791 Cond = X86Op.getValue(1);
17793 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
17794 addTest = false;
17795 } else {
17796 unsigned CondOpc;
17797 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
17798 SDValue Cmp = Cond.getOperand(0).getOperand(1);
17799 if (CondOpc == ISD::OR) {
17800 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
17801 // two branches instead of an explicit OR instruction with a
17802 // separate test.
17803 if (Cmp == Cond.getOperand(1).getOperand(1) &&
17804 isX86LogicalCmp(Cmp)) {
17805 CC = Cond.getOperand(0).getOperand(0);
17806 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17807 Chain, Dest, CC, Cmp);
17808 CC = Cond.getOperand(1).getOperand(0);
17809 Cond = Cmp;
17810 addTest = false;
17811 }
17812 } else { // ISD::AND
17813 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
17814 // two branches instead of an explicit AND instruction with a
17815 // separate test. However, we only do this if this block doesn't
17816 // have a fall-through edge, because this requires an explicit
17817 // jmp when the condition is false.
17818 if (Cmp == Cond.getOperand(1).getOperand(1) &&
17819 isX86LogicalCmp(Cmp) &&
17820 Op.getNode()->hasOneUse()) {
17821 X86::CondCode CCode =
17822 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
17823 CCode = X86::GetOppositeBranchCondition(CCode);
17824 CC = DAG.getConstant(CCode, dl, MVT::i8);
17825 SDNode *User = *Op.getNode()->use_begin();
17826 // Look for an unconditional branch following this conditional branch.
17827 // We need this because we need to reverse the successors in order
17828 // to implement FCMP_OEQ.
17829 if (User->getOpcode() == ISD::BR) {
17830 SDValue FalseBB = User->getOperand(1);
17831 SDNode *NewBR =
17832 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
17833 assert(NewBR == User);
17834 (void)NewBR;
17835 Dest = FalseBB;
17836 }
17837 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17838 Chain, Dest, CC, Cmp);
17839 X86::CondCode CCode =
17840 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
17841 CCode = X86::GetOppositeBranchCondition(CCode);
17842 CC = DAG.getConstant(CCode, dl, MVT::i8);
17843 Cond = Cmp;
17844 addTest = false;
17845 }
17846 }
17847 }
17848 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
17849 // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
17850 // It should be transformed during DAG combining except when the condition
17851 // is set by an arithmetic-with-overflow node.
17852 X86::CondCode CCode =
17853 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
17854 CCode = X86::GetOppositeBranchCondition(CCode);
17855 CC = DAG.getConstant(CCode, dl, MVT::i8);
17856 Cond = Cond.getOperand(0).getOperand(1);
17857 addTest = false;
17858 } else if (Cond.getOpcode() == ISD::SETCC &&
17859 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
17860 // For FCMP_OEQ, we can emit
17861 // two branches instead of an explicit AND instruction with a
17862 // separate test. However, we only do this if this block doesn't
17863 // have a fall-through edge, because this requires an explicit
17864 // jmp when the condition is false.
17865 if (Op.getNode()->hasOneUse()) {
17866 SDNode *User = *Op.getNode()->use_begin();
17867 // Look for an unconditional branch following this conditional branch.
17868 // We need this because we need to reverse the successors in order
17869 // to implement FCMP_OEQ.
17870 if (User->getOpcode() == ISD::BR) {
17871 SDValue FalseBB = User->getOperand(1);
17872 SDNode *NewBR =
17873 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
17874 assert(NewBR == User);
17875 (void)NewBR;
17876 Dest = FalseBB;
17878 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
17879 Cond.getOperand(0), Cond.getOperand(1));
17880 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17881 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
17882 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17883 Chain, Dest, CC, Cmp);
17884 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
17885 Cond = Cmp;
17886 addTest = false;
17887 }
17888 }
17889 } else if (Cond.getOpcode() == ISD::SETCC &&
17890 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
17891 // For FCMP_UNE, we can emit
17892 // two branches instead of an explicit AND instruction with a
17893 // separate test. However, we only do this if this block doesn't
17894 // have a fall-through edge, because this requires an explicit
17895 // jmp when the condition is false.
17896 if (Op.getNode()->hasOneUse()) {
17897 SDNode *User = *Op.getNode()->use_begin();
17898 // Look for an unconditional branch following this conditional branch.
17899 // We need this because we need to reverse the successors in order
17900 // to implement FCMP_UNE.
17901 if (User->getOpcode() == ISD::BR) {
17902 SDValue FalseBB = User->getOperand(1);
17903 SDNode *NewBR =
17904 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
17905 assert(NewBR == User);
17906 (void)NewBR;
17907 Dest = FalseBB;
17908 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
17909 Cond.getOperand(0), Cond.getOperand(1));
17910 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17911 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
17912 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17913 Chain, Dest, CC, Cmp);
17914 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
17915 Cond = Cmp;
17916 addTest = false;
17917 }
17918 }
17919 }
17921 if (addTest) {
17924 // Look past the truncate if the high bits are known zero.
17925 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17926 Cond = Cond.getOperand(0);
17928 // We know the result is compared against zero. Try to match it to BT.
17929 if (Cond.hasOneUse()) {
17930 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
17931 CC = NewSetCC.getOperand(0);
17932 Cond = NewSetCC.getOperand(1);
17933 addTest = false;
17934 }
17935 }
17936 }
17938 if (addTest) {
17939 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
17940 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
17941 Cond = EmitTest(Cond, X86Cond, dl, DAG);
17942 }
17943 Cond = ConvertCmpIfNecessary(Cond, DAG);
17944 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17945 Chain, Dest, CC, Cond);
17946 }
17948 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
17949 // Calls to _alloca are needed to probe the stack when allocating more than 4k
17950 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
17951 // that the guard pages used by the OS virtual memory manager are allocated in
17952 // correct sequence.
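// Conceptually the probe sequence behaves like the following pseudo-code
// (the real work happens inside the runtime's _alloca/__chkstk helper):
//   while (Bytes > 4096) { *(SP - 4096) = probe; SP -= 4096; Bytes -= 4096; }
//   SP -= Bytes;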
17953 SDValue
17954 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
17955 SelectionDAG &DAG) const {
17956 MachineFunction &MF = DAG.getMachineFunction();
17957 bool SplitStack = MF.shouldSplitStack();
17958 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
17959 SplitStack;
17960 SDLoc dl(Op);
17963 SDNode *Node = Op.getNode();
17964 SDValue Chain = Op.getOperand(0);
17965 SDValue Size = Op.getOperand(1);
17966 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
17967 EVT VT = Node->getValueType(0);
17969 // Chain the dynamic stack allocation so that it doesn't modify the stack
17970 // pointer when other instructions are using the stack.
17971 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
17973 bool Is64Bit = Subtarget.is64Bit();
17974 MVT SPTy = getPointerTy(DAG.getDataLayout());
17976 SDValue Result;
17977 if (!Lower) {
17978 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17979 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
17980 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
17981 " not tell us which reg is the stack pointer!");
17983 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
17984 Chain = SP.getValue(1);
17985 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
17986 unsigned StackAlign = TFI.getStackAlignment();
17987 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
17988 if (Align > StackAlign)
17989 Result = DAG.getNode(ISD::AND, dl, VT, Result,
17990 DAG.getConstant(-(uint64_t)Align, dl, VT));
17991 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
17992 } else if (SplitStack) {
17993 MachineRegisterInfo &MRI = MF.getRegInfo();
17995 if (Is64Bit) {
17996 // The 64 bit implementation of segmented stacks needs to clobber both r10
17997 // and r11. This makes it impossible to use it along with nested parameters.
17998 const Function *F = MF.getFunction();
17999 for (const auto &A : F->args()) {
18000 if (A.hasNestAttr())
18001 report_fatal_error("Cannot use segmented stacks with functions that "
18002 "have nested arguments.");
18006 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18007 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18008 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18009 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
18010 DAG.getRegister(Vreg, SPTy));
18011 } else {
18012 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18013 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18014 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18016 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18017 unsigned SPReg = RegInfo->getStackRegister();
18018 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
18019 Chain = SP.getValue(1);
18021 if (Align) {
18022 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18023 DAG.getConstant(-(uint64_t)Align, dl, VT));
18024 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
18025 }
18027 Result = SP;
18028 }
18030 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18031 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18033 SDValue Ops[2] = {Result, Chain};
18034 return DAG.getMergeValues(Ops, dl);
18035 }
18037 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18038 MachineFunction &MF = DAG.getMachineFunction();
18039 auto PtrVT = getPointerTy(MF.getDataLayout());
18040 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18042 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18043 SDLoc DL(Op);
18045 if (!Subtarget.is64Bit() ||
18046 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18047 // vastart just stores the address of the VarArgsFrameIndex slot into the
18048 // memory location argument.
18049 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18050 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18051 MachinePointerInfo(SV));
18052 }
18054 // __va_list_tag:
18055 // gp_offset (0 - 6 * 8)
18056 // fp_offset (48 - 48 + 8 * 16)
18057 // overflow_arg_area (points to parameters coming in memory).
18058 // reg_save_area
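// In C terms, the structure being initialized is the SysV AMD64 va_list:
//   typedef struct {
//     unsigned int gp_offset;
//     unsigned int fp_offset;
//     void *overflow_arg_area;
//     void *reg_save_area;
//   } __va_list_tag;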
18059 SmallVector<SDValue, 8> MemOps;
18060 SDValue FIN = Op.getOperand(1);
18061 // Store gp_offset.
18062 SDValue Store = DAG.getStore(
18063 Op.getOperand(0), DL,
18064 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18065 MachinePointerInfo(SV));
18066 MemOps.push_back(Store);
18068 // Store fp_offset.
18069 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18070 Store = DAG.getStore(
18071 Op.getOperand(0), DL,
18072 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18073 MachinePointerInfo(SV, 4));
18074 MemOps.push_back(Store);
18076 // Store ptr to overflow_arg_area
18077 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18078 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18079 Store =
18080 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18081 MemOps.push_back(Store);
18083 // Store ptr to reg_save_area.
18084 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18085 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18086 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18087 Store = DAG.getStore(
18088 Op.getOperand(0), DL, RSFIN, FIN,
18089 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18090 MemOps.push_back(Store);
18091 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
18092 }
18094 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18095 assert(Subtarget.is64Bit() &&
18096 "LowerVAARG only handles 64-bit va_arg!");
18097 assert(Op.getNumOperands() == 4);
18099 MachineFunction &MF = DAG.getMachineFunction();
18100 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18101 // The Win64 ABI uses char* instead of a structure.
18102 return DAG.expandVAArg(Op.getNode());
18104 SDValue Chain = Op.getOperand(0);
18105 SDValue SrcPtr = Op.getOperand(1);
18106 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18107 unsigned Align = Op.getConstantOperandVal(3);
18108 SDLoc dl(Op);
18110 EVT ArgVT = Op.getNode()->getValueType(0);
18111 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18112 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
18113 uint8_t ArgMode;
18115 // Decide which area this value should be read from.
18116 // TODO: Implement the AMD64 ABI in its entirety. This simple
18117 // selection mechanism works only for the basic types.
18118 if (ArgVT == MVT::f80) {
18119 llvm_unreachable("va_arg for f80 not yet implemented");
18120 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18121 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
18122 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18123 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
18125 llvm_unreachable("Unhandled argument type in LowerVAARG");
18128 if (ArgMode == 2) {
18129 // Sanity Check: Make sure using fp_offset makes sense.
18130 assert(!Subtarget.useSoftFloat() &&
18131 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
18132 Subtarget.hasSSE1());
18133 }
18135 // Insert VAARG_64 node into the DAG
18136 // VAARG_64 returns two values: Variable Argument Address, Chain
18137 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
18138 DAG.getConstant(ArgMode, dl, MVT::i8),
18139 DAG.getConstant(Align, dl, MVT::i32)};
18140 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
18141 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
18142 VTs, InstOps, MVT::i64,
18143 MachinePointerInfo(SV),
18144 /*Align=*/0,
18145 /*Volatile=*/false,
18146 /*ReadMem=*/true,
18147 /*WriteMem=*/true);
18148 Chain = VAARG.getValue(1);
18150 // Load the next argument and return it
18151 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
18152 }
18154 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
18155 SelectionDAG &DAG) {
18156 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
18157 // where a va_list is still an i8*.
18158 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
18159 if (Subtarget.isCallingConvWin64(
18160 DAG.getMachineFunction().getFunction()->getCallingConv()))
18161 // Probably a Win64 va_copy.
18162 return DAG.expandVACopy(Op.getNode());
18164 SDValue Chain = Op.getOperand(0);
18165 SDValue DstPtr = Op.getOperand(1);
18166 SDValue SrcPtr = Op.getOperand(2);
18167 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
18168 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18170 SDLoc DL(Op);
18171 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
18172 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
18173 /*AlwaysInline=*/false, /*isTailCall=*/false,
18174 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
18175 }
18177 /// Handle vector element shifts where the shift amount is a constant.
18178 /// Takes immediate version of shift as input.
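/// For example, a VSHLI by 2 of the constant build_vector <1, 2, 3, 4>
/// folds below to the build_vector <4, 8, 12, 16> without emitting any
/// shift instruction (values illustrative).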
18179 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
18180 SDValue SrcOp, uint64_t ShiftAmt,
18181 SelectionDAG &DAG) {
18182 MVT ElementType = VT.getVectorElementType();
18184 // Fold this packed shift into its first operand if ShiftAmt is 0.
18185 if (ShiftAmt == 0)
18186 return SrcOp;
18188 // Check for ShiftAmt >= element width.
18189 if (ShiftAmt >= ElementType.getSizeInBits()) {
18190 if (Opc == X86ISD::VSRAI)
18191 ShiftAmt = ElementType.getSizeInBits() - 1;
18192 else
18193 return DAG.getConstant(0, dl, VT);
18194 }
18196 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
18197 && "Unknown target vector shift-by-constant node");
18199 // Fold this packed vector shift into a build vector if SrcOp is a
18200 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
18201 if (VT == SrcOp.getSimpleValueType() &&
18202 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
18203 SmallVector<SDValue, 8> Elts;
18204 unsigned NumElts = SrcOp->getNumOperands();
18205 ConstantSDNode *ND;
18207 switch (Opc) {
18208 default: llvm_unreachable("Unknown opcode!");
18209 case X86ISD::VSHLI:
18210 for (unsigned i=0; i!=NumElts; ++i) {
18211 SDValue CurrentOp = SrcOp->getOperand(i);
18212 if (CurrentOp->isUndef()) {
18213 Elts.push_back(CurrentOp);
18214 continue;
18215 }
18216 ND = cast<ConstantSDNode>(CurrentOp);
18217 const APInt &C = ND->getAPIntValue();
18218 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
18219 }
18220 break;
18221 case X86ISD::VSRLI:
18222 for (unsigned i=0; i!=NumElts; ++i) {
18223 SDValue CurrentOp = SrcOp->getOperand(i);
18224 if (CurrentOp->isUndef()) {
18225 Elts.push_back(CurrentOp);
18226 continue;
18227 }
18228 ND = cast<ConstantSDNode>(CurrentOp);
18229 const APInt &C = ND->getAPIntValue();
18230 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
18231 }
18232 break;
18233 case X86ISD::VSRAI:
18234 for (unsigned i=0; i!=NumElts; ++i) {
18235 SDValue CurrentOp = SrcOp->getOperand(i);
18236 if (CurrentOp->isUndef()) {
18237 Elts.push_back(CurrentOp);
18238 continue;
18239 }
18240 ND = cast<ConstantSDNode>(CurrentOp);
18241 const APInt &C = ND->getAPIntValue();
18242 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
18243 }
18244 break;
18245 }
18247 return DAG.getBuildVector(VT, dl, Elts);
18248 }
18250 return DAG.getNode(Opc, dl, VT, SrcOp,
18251 DAG.getConstant(ShiftAmt, dl, MVT::i8));
18252 }
18254 /// Handle vector element shifts where the shift amount may or may not be a
18255 /// constant. Takes immediate version of shift as input.
18256 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
18257 SDValue SrcOp, SDValue ShAmt,
18258 SelectionDAG &DAG) {
18259 MVT SVT = ShAmt.getSimpleValueType();
18260 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
18262 // Catch shift-by-constant.
18263 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
18264 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
18265 CShAmt->getZExtValue(), DAG);
18267 // Change opcode to non-immediate version.
18268 switch (Opc) {
18269 default: llvm_unreachable("Unknown target vector shift node");
18270 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
18271 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
18272 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
18273 }
18275 const X86Subtarget &Subtarget =
18276 static_cast<const X86Subtarget &>(DAG.getSubtarget());
18277 if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
18278 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
18279 // Let the shuffle legalizer expand this shift amount node.
18280 SDValue Op0 = ShAmt.getOperand(0);
18281 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
18282 ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG);
18283 } else {
18284 // Need to build a vector containing shift amount.
18285 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
18286 SmallVector<SDValue, 4> ShOps;
18287 ShOps.push_back(ShAmt);
18288 if (SVT == MVT::i32) {
18289 ShOps.push_back(DAG.getConstant(0, dl, SVT));
18290 ShOps.push_back(DAG.getUNDEF(SVT));
18291 }
18292 ShOps.push_back(DAG.getUNDEF(SVT));
18294 MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
18295 ShAmt = DAG.getBuildVector(BVT, dl, ShOps);
18296 }
18298 // The return type has to be a 128-bit type with the same element
18299 // type as the input type.
18300 MVT EltVT = VT.getVectorElementType();
18301 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
18303 ShAmt = DAG.getBitcast(ShVT, ShAmt);
18304 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
18305 }
18307 /// \brief Return Mask with the necessary casting or extending
18308 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
18309 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
18310 const X86Subtarget &Subtarget, SelectionDAG &DAG,
18311 const SDLoc &dl) {
18313 if (isAllOnesConstant(Mask))
18314 return DAG.getTargetConstant(1, dl, MaskVT);
18315 if (X86::isZeroNode(Mask))
18316 return DAG.getTargetConstant(0, dl, MaskVT);
18318 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
18319 // Mask should be extended
18320 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
18321 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
18322 }
18324 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
18325 if (MaskVT == MVT::v64i1) {
18326 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
18327 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
18328 SDValue Lo, Hi;
18329 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18330 DAG.getConstant(0, dl, MVT::i32));
18331 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18332 DAG.getConstant(1, dl, MVT::i32));
18334 Lo = DAG.getBitcast(MVT::v32i1, Lo);
18335 Hi = DAG.getBitcast(MVT::v32i1, Hi);
18337 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
18338 } else {
18339 // MaskVT requires < 64 bits. Truncate the mask (truncation should
18340 // always succeed here) and bitcast to the required vector type.
18341 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
18342 return DAG.getBitcast(MaskVT,
18343 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
18344 }
18345 }
18347 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
18348 Mask.getSimpleValueType().getSizeInBits());
18349 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
18350 // are extracted by EXTRACT_SUBVECTOR.
18351 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
18352 DAG.getBitcast(BitcastVT, Mask),
18353 DAG.getIntPtrConstant(0, dl));
18354 }
18357 /// \brief Return (and \p Op, \p Mask) for compare instructions or
18358 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
18359 /// necessary casting or extending for \p Mask when lowering masking intrinsics
18360 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
18361 SDValue PreservedSrc,
18362 const X86Subtarget &Subtarget,
18363 SelectionDAG &DAG) {
18364 MVT VT = Op.getSimpleValueType();
18365 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18366 unsigned OpcodeSelect = ISD::VSELECT;
18367 SDLoc dl(Op);
18369 if (isAllOnesConstant(Mask))
18370 return Op;
18372 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18374 switch (Op.getOpcode()) {
18376 case X86ISD::PCMPEQM:
18377 case X86ISD::PCMPGTM:
18379 case X86ISD::CMPMU:
18380 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
18381 case X86ISD::VFPCLASS:
18382 case X86ISD::VFPCLASSS:
18383 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
18384 case X86ISD::VTRUNC:
18385 case X86ISD::VTRUNCS:
18386 case X86ISD::VTRUNCUS:
18387 case X86ISD::CVTPS2PH:
18388 // We can't use ISD::VSELECT here because it is not always "Legal"
18389 // for the destination type. For example vpmovqb require only AVX512
18390 // and vselect that can operate on byte element type require BWI
18391 OpcodeSelect = X86ISD::SELECT;
18394 if (PreservedSrc.isUndef())
18395 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
18396 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
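// Example (hypothetical): a masked intrinsic whose body is (add %a, %b) with
// pass-through %src lowers to (vselect %vmask, (add %a, %b), %src); lanes
// with a zero mask bit keep the corresponding lane of %src.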
/// \brief Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask is coming as MVT::i8 and it should be truncated
/// to MVT::i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  if (isAllOnesConstant(Mask))
    return Op;

  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  // The mask should be of type MVT::i1
  SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);

  if (Op.getOpcode() == X86ISD::FSETCCM ||
      Op.getOpcode() == X86ISD::FSETCCM_RND)
    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
  if (Op.getOpcode() == X86ISD::VFPCLASS ||
      Op.getOpcode() == X86ISD::VFPCLASSS)
    return DAG.getNode(ISD::OR, dl, VT, Op, IMask);

  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}

static int getSEHRegistrationNodeSize(const Function *Fn) {
  if (!Fn->hasPersonalityFn())
    report_fatal_error(
        "querying registration node size for function without personality");
  // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
  // WinEHStatePass for the full struct definition.
  switch (classifyEHPersonality(Fn->getPersonalityFn())) {
  case EHPersonality::MSVC_X86SEH: return 24;
  case EHPersonality::MSVC_CXX: return 16;
  default: break;
  }
  report_fatal_error(
      "can only recover FP for 32-bit MSVC EH personality functions");
}

/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
///   RegNodeBase = EntryEBP - RegNodeSize
///   ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
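/// Worked example (illustrative numbers only): with the MSVC C++ personality,
/// RegNodeSize is 16, so for EntryEBP = 0x00A0 and ParentFrameOffset = -0x40:
///   RegNodeBase = 0x00A0 - 16      = 0x0090
///   ParentFP    = 0x0090 - (-0x40) = 0x00D0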
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
                                   SDValue EntryEBP) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc dl;

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());

  // It's possible that the parent function no longer has a personality function
  // if the exceptional code was optimized away, in which case we just return
  // the incoming EBP.
  if (!Fn->hasPersonalityFn())
    return EntryEBP;

  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
  // registration, or the .set_setframe offset.
  MCSymbol *OffsetSym =
      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
          GlobalValue::getRealLinkageName(Fn->getName()));
  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
  SDValue ParentFrameOffset =
      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);

  // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
  // prologue to RBP in the parent function.
  const X86Subtarget &Subtarget =
      static_cast<const X86Subtarget &>(DAG.getSubtarget());
  if (Subtarget.is64Bit())
    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
  // RegNodeBase = EntryEBP - RegNodeSize
  // ParentFP = RegNodeBase - ParentFrameOffset
  SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
                                    DAG.getConstant(RegNodeSize, dl, PtrVT));
  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}

static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  // Helper to detect if the operand is CUR_DIRECTION rounding mode.
  auto isRoundModeCurDirection = [](SDValue Rnd) {
    if (!isa<ConstantSDNode>(Rnd))
      return false;

    unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
    return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
  };

  SDLoc dl(Op);
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  MVT VT = Op.getSimpleValueType();
  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
  if (IntrData) {
    switch(IntrData->Type) {
    case INTR_TYPE_1OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
    case INTR_TYPE_2OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    case INTR_TYPE_3OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2), Op.getOperand(3));
    case INTR_TYPE_4OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
    case INTR_TYPE_1OP_MASK_RM: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      SDValue RoundingMode;
      // We always add a rounding mode to the node.
      // If the rounding mode is not specified, we add the
      // "current direction" mode.
      if (Op.getNumOperands() == 4)
        RoundingMode =
          DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      else
        RoundingMode = Op.getOperand(4);
      assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              RoundingMode),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_1OP_MASK: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      // We add a rounding mode to the node when
      //   - an RM opcode is specified and
      //   - RM is not "current direction".
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_SCALAR_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue passThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                  Mask, passThru, Subtarget, DAG);
    }
    case INTR_TYPE_SCALAR_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src0 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      // There are 2 kinds of intrinsics in this group:
      // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
      // (2) With rounding mode and sae - 7 operands.
      if (Op.getNumOperands() == 6) {
        SDValue Sae = Op.getOperand(5);
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
                                                Sae),
                                    Mask, Src0, Subtarget, DAG);
      }
      assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
      SDValue RoundingMode = Op.getOperand(5);
      SDValue Sae = Op.getOperand(6);
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
                                              RoundingMode, Sae),
                                  Mask, Src0, Subtarget, DAG);
    }
    case INTR_TYPE_2OP_MASK:
    case INTR_TYPE_2OP_IMM8_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);

      if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
        Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      // TODO: Intrinsics should have fast-math-flags to propagate.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_2OP_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // mode.
      // First, we check if the intrinsic has a rounding mode (6 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 6)
        Rnd = Op.getOperand(5);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_SCALAR_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      SDValue Sae = Op.getOperand(6);

      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
                                              Src2, Src3, Sae),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Imm = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // mode.
      // First, we check if the intrinsic has a rounding mode (7 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Imm, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
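    // Note for the INSERT_SUBVEC case below (illustrative): the intrinsic
    // immediate counts subvectors, while ISD::INSERT_SUBVECTOR counts
    // elements. E.g. inserting a v4f32 subvector with intrinsic imm 2 becomes
    // element index 2 * 4 = 8.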
    case INTR_TYPE_3OP_IMM8_MASK:
    case INTR_TYPE_3OP_MASK:
    case INSERT_SUBVEC: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);

      if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
        Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
      else if (IntrData->Type == INSERT_SUBVEC) {
        // imm should be adapted to ISD::INSERT_SUBVECTOR behavior
        assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
        unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
        Imm *= Src2.getSimpleValueType().getVectorNumElements();
        Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
      }

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(6);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_2OP_MASK : {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);

      // Swap Src1 and Src2 in the node creation
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_3OP_MASKZ:
    case VPERM_3OP_MASK:{
      MVT VT = Op.getSimpleValueType();
      // Src2 is the PassThru
      SDValue Src1 = Op.getOperand(1);
      // PassThru needs to be the same type as the destination in order
      // to pattern match correctly.
      SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      SDValue PassThru = SDValue();

      // set PassThru element
      if (IntrData->Type == VPERM_3OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else
        PassThru = Src2;

      // Swap Src1 and Src2 in the node creation
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src2, Src1, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FMA_OP_MASK3:
    case FMA_OP_MASKZ:
    case FMA_OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // set PassThru element
      if (IntrData->Type == FMA_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
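    // Note on the FMA cases above (illustrative): the value preserved for
    // masked-off lanes follows the variant: MASKZ zeroes them, MASK3 keeps
    // lanes of Src3, and plain MASK keeps lanes of Src1.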
    case FMA_OP_SCALAR_MASK:
    case FMA_OP_SCALAR_MASK3:
    case FMA_OP_SCALAR_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // set PassThru element
      if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      SDValue Rnd = Op.getOperand(5);
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
                                              Op.getValueType(), Src1, Src2,
                                              Src3, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case TERLOG_OP_MASK:
    case TERLOG_OP_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
      SDValue Mask = Op.getOperand(5);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = Src1;
      // Set PassThru element.
      if (IntrData->Type == TERLOG_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3, Src4),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case CVTPD2PS:
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
                         DAG.getIntPtrConstant(0, dl));
    case CVTPD2PS_MASK: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      // We add a rounding mode to the node when
      //   - an RM opcode is specified and
      //   - RM is not "current direction".
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              DAG.getIntPtrConstant(0, dl)),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FPCLASS: {
      // FPclass intrinsics with mask
      SDValue Src1 = Op.getOperand(1);
      MVT VT = Src1.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
      SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
                                                 DAG.getTargetConstant(0, dl, MaskVT),
                                                 Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), FPclassMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case FPCLASSS: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
                                                 DAG.getTargetConstant(0, dl, MVT::i1),
                                                 Subtarget, DAG);
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
    }
    case CMP_MASK:
    case CMP_MASK_CC: {
      // Comparison intrinsics with masks.
      // Example of transformation:
      // (i8 (int_x86_avx512_mask_pcmpeq_q_128
      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
      // (i8 (bitcast
      //   (v8i1 (insert_subvector undef,
      //           (v2i1 (and (PCMPEQM %a, %b),
      //                      (extract_subvector
      //                         (v8i1 (bitcast %mask)), 0))), 0))))
      MVT VT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue Cmp;
      if (IntrData->Type == CMP_MASK_CC) {
        SDValue CC = Op.getOperand(3);
        CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
        // We specify 2 possible opcodes for intrinsics with rounding modes.
        // First, we check if the intrinsic may have a non-default rounding mode
        // (IntrData->Opc1 != 0), then we check the rounding mode operand.
        if (IntrData->Opc1 != 0) {
          SDValue Rnd = Op.getOperand(5);
          if (!isRoundModeCurDirection(Rnd))
            Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
                              Op.getOperand(2), CC, Rnd);
        }
        // default rounding mode
        if (!Cmp.getNode())
          Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                            Op.getOperand(2), CC);
      } else {
        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                          Op.getOperand(2));
      }
      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl,
                                                                   MaskVT),
                                             Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CmpMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case CMP_MASK_SCALAR_CC: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
      SDValue Mask = Op.getOperand(4);

      SDValue Cmp;
      if (IntrData->Opc1 != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
      }
      // default rounding mode
      if (!Cmp.getNode())
        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);

      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl,
                                                                   MVT::i1),
                                             Subtarget, DAG);

      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
    }
    case COMI: { // Comparison intrinsics
      ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
      SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
      SDValue SetCC;
      switch (CC) {
      case ISD::SETEQ: { // (ZF = 0 and PF = 0)
        SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
        SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
        break;
      }
      case ISD::SETNE: { // (ZF = 1 or PF = 1)
        SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
        SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
        break;
      }
      case ISD::SETGT: // (CF = 0 and ZF = 0)
        SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
        break;
      case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
        SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
        break;
      }
      case ISD::SETGE: // CF = 0
        SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
        break;
      case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
        SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
        break;
      default:
        llvm_unreachable("Unexpected illegal condition!");
      }
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
    }
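    // Note: (U)COMIS* sets ZF = PF = CF = 1 for unordered operands, which is
    // why the SETEQ/SETNE lowerings above must also test PF to stay correct
    // in the presence of NaNs.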
    case COMI_RM: { // Comparison intrinsics with Sae
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      SDValue Sae = Op.getOperand(4);

      SDValue FCmp;
      if (isRoundModeCurDirection(Sae))
        FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8));
      else
        FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8), Sae);
      // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
      return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
    }
    case VSHIFT:
      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                 Op.getOperand(1), Op.getOperand(2), DAG);
    case COMPRESS_EXPAND_IN_REG: {
      SDValue Mask = Op.getOperand(3);
      SDValue DataToCompress = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      if (isAllOnesConstant(Mask)) // return data as is
        return Op.getOperand(1);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              DataToCompress),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case BROADCASTM: {
      SDValue Mask = Op.getOperand(1);
      MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                    Mask.getSimpleValueType().getSizeInBits());
      Mask = DAG.getBitcast(MaskVT, Mask);
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
    }
    case KUNPCK: {
      MVT VT = Op.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);

      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
      // Arguments should be swapped.
      SDValue Res = DAG.getNode(IntrData->Opc0, dl,
                                MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
                                Src2, Src1);
      return DAG.getBitcast(VT, Res);
    }
    case FIXUPIMMS:
    case FIXUPIMMS_MASKZ:
    case FIXUPIMM:
    case FIXUPIMM_MASKZ:{
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Imm = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS) ?
                                         Src1 : getZeroVector(VT, Subtarget, DAG, dl);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // mode.
      // First, we check if the intrinsic has a rounding mode (7 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                                Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
      else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                                Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
    }
    case CONVERT_TO_MASK: {
      MVT SrcVT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
      MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

      SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
                                    Op.getOperand(1));
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CvtMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case CONVERT_MASK_TO_VEC: {
      SDValue Mask = Op.getOperand(1);
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
      return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
    }
    case BRCST_SUBVEC_TO_VEC: {
      SDValue Src = Op.getOperand(1);
      SDValue Passthru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      EVT resVT = Passthru.getValueType();
      SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
                                   DAG.getUNDEF(resVT), Src,
                                   DAG.getIntPtrConstant(0, dl));
      SDValue immVal;
      if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
        immVal = DAG.getConstant(0x44, dl, MVT::i8);
      else
        immVal = DAG.getConstant(0, dl, MVT::i8);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              subVec, subVec, immVal),
                                  Mask, Passthru, Subtarget, DAG);
    }
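    // Note on the shuffle immediate above (illustrative): 0x44 == 0b01000100
    // selects 128-bit lanes 0,1,0,1, i.e. it repeats the freshly inserted
    // 256-bit subvector into both halves of the 512-bit result.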
    case BRCST32x2_TO_VEC: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);

      assert((VT.getScalarType() == MVT::i32 ||
              VT.getScalarType() == MVT::f32) && "Unexpected type!");
      // bitcast Src to packed 64
      MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
      MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
      Src = DAG.getBitcast(BitcastVT, Src);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
    default:
      break;
    }
  }

  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
    // Operands intentionally swapped. Mask is last operand to intrinsic,
    // but second operand for node/instruction.
    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(1));

  // ptest and testp intrinsics. The intrinsics these come from are designed to
  // return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestnzc:
  case Intrinsic::x86_avx_ptestz_256:
  case Intrinsic::x86_avx_ptestc_256:
  case Intrinsic::x86_avx_ptestnzc_256:
  case Intrinsic::x86_avx_vtestz_ps:
  case Intrinsic::x86_avx_vtestc_ps:
  case Intrinsic::x86_avx_vtestnzc_ps:
  case Intrinsic::x86_avx_vtestz_pd:
  case Intrinsic::x86_avx_vtestc_pd:
  case Intrinsic::x86_avx_vtestnzc_pd:
  case Intrinsic::x86_avx_vtestz_ps_256:
  case Intrinsic::x86_avx_vtestc_ps_256:
  case Intrinsic::x86_avx_vtestnzc_ps_256:
  case Intrinsic::x86_avx_vtestz_pd_256:
  case Intrinsic::x86_avx_vtestc_pd_256:
  case Intrinsic::x86_avx_vtestnzc_pd_256: {
    bool IsTestPacked = false;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
    case Intrinsic::x86_avx_vtestz_ps:
    case Intrinsic::x86_avx_vtestz_pd:
    case Intrinsic::x86_avx_vtestz_ps_256:
    case Intrinsic::x86_avx_vtestz_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestz:
    case Intrinsic::x86_avx_ptestz_256:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_avx_vtestc_ps:
    case Intrinsic::x86_avx_vtestc_pd:
    case Intrinsic::x86_avx_vtestc_ps_256:
    case Intrinsic::x86_avx_vtestc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestc:
    case Intrinsic::x86_avx_ptestc_256:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_avx_vtestnzc_ps:
    case Intrinsic::x86_avx_vtestnzc_pd:
    case Intrinsic::x86_avx_vtestnzc_ps_256:
    case Intrinsic::x86_avx_vtestnzc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestnzc:
    case Intrinsic::x86_avx_ptestnzc_256:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  case Intrinsic::x86_avx512_kortestz_w:
  case Intrinsic::x86_avx512_kortestc_w: {
    X86::CondCode X86CC =
        (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }

  case Intrinsic::x86_sse42_pcmpistria128:
  case Intrinsic::x86_sse42_pcmpestria128:
  case Intrinsic::x86_sse42_pcmpistric128:
  case Intrinsic::x86_sse42_pcmpestric128:
  case Intrinsic::x86_sse42_pcmpistrio128:
  case Intrinsic::x86_sse42_pcmpestrio128:
  case Intrinsic::x86_sse42_pcmpistris128:
  case Intrinsic::x86_sse42_pcmpestris128:
  case Intrinsic::x86_sse42_pcmpistriz128:
  case Intrinsic::x86_sse42_pcmpestriz128: {
    unsigned Opcode;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
    case Intrinsic::x86_sse42_pcmpistria128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpestria128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpistric128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpestric128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpistrio128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpestrio128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpistris128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpestris128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpistriz128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_sse42_pcmpestriz128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_E;
      break;
    }
    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
    SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }

  case Intrinsic::x86_sse42_pcmpistri128:
  case Intrinsic::x86_sse42_pcmpestri128: {
    unsigned Opcode;
    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
      Opcode = X86ISD::PCMPISTRI;
    else
      Opcode = X86ISD::PCMPESTRI;

    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    return DAG.getNode(Opcode, dl, VTs, NewOps);
  }

  case Intrinsic::eh_sjlj_lsda: {
    MachineFunction &MF = DAG.getMachineFunction();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    auto &Context = MF.getMMI().getContext();
    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
                                            Twine(MF.getFunctionNumber()));
    return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
  }

  case Intrinsic::x86_seh_lsda: {
    // Compute the symbol for the LSDA. We know it'll get emitted later.
    MachineFunction &MF = DAG.getMachineFunction();
    SDValue Op1 = Op.getOperand(1);
    auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
    MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
        GlobalValue::getRealLinkageName(Fn->getName()));

    // Generate a simple absolute symbol reference. This intrinsic is only
    // supported on 32-bit Windows, which isn't PIC.
    SDValue Result = DAG.getMCSymbol(LSDASym, VT);
    return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
  }

  case Intrinsic::x86_seh_recoverfp: {
    SDValue FnOp = Op.getOperand(1);
    SDValue IncomingFPOp = Op.getOperand(2);
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    if (!Fn)
      report_fatal_error(
          "llvm.x86.seh.recoverfp must take a function as the first argument");
    return recoverFramePointer(DAG, Fn, IncomingFPOp);
  }

  case Intrinsic::localaddress: {
    // Returns one of the stack, base, or frame pointer registers, depending on
    // which is used to reference local variables.
    MachineFunction &MF = DAG.getMachineFunction();
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    unsigned Reg;
    if (RegInfo->hasBasePointer(MF))
      Reg = RegInfo->getBaseRegister();
    else // This function handles the SP or FP case.
      Reg = RegInfo->getPtrSizedFrameRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
  }
  }
}

static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                             SDValue Src, SDValue Mask, SDValue Base,
                             SDValue Index, SDValue ScaleOp, SDValue Chain,
                             const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = cast<ConstantSDNode>(ScaleOp);
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                Index.getSimpleValueType().getVectorNumElements());

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  if (Src.isUndef())
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}
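// Note (illustrative): the operand list built above follows the usual X86
// five-part memory reference (Base, Scale, Index, Disp, Segment) expected by
// the gather machine instructions, preceded by the destination value and the
// mask register, and followed by the chain.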
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                              SDValue Src, SDValue Mask, SDValue Base,
                              SDValue Index, SDValue ScaleOp, SDValue Chain,
                              const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = cast<ConstantSDNode>(ScaleOp);
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                Index.getSimpleValueType().getVectorNumElements());

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  return SDValue(Res, 1);
}

static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                               SDValue Mask, SDValue Base, SDValue Index,
                               SDValue ScaleOp, SDValue Chain,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = cast<ConstantSDNode>(ScaleOp);
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT =
    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
  return SDValue(Res, 0);
}

/// Handles the lowering of builtin intrinsics that return the value
/// of the extended control register.
static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget,
                                       SmallVectorImpl<SDValue> &Results) {
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the XCR register to
  // return.
  SDValue Chain =
      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
  SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
  Chain = SDValue(N1, 0);

  // Reads the content of XCR and returns it in registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // Merge the two 32-bit values into a 64-bit one.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}

/// Handles the lowering of builtin intrinsics that read performance monitor
/// counters (x86_rdpmc).
static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget,
                                      SmallVectorImpl<SDValue> &Results) {
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the performance counter
  // to read.
  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
                                   N->getOperand(2));
  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);

  // Reads the content of a 64-bit performance counter and returns it in the
  // registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // The EAX register is loaded with the low-order 32 bits. The EDX register
    // is loaded with the supported high-order bits of the counter.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}

/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    SmallVectorImpl<SDValue> &Results) {
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
  SDValue LO, HI;

  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
  // and the EAX register is loaded with the low-order 32 bits.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  SDValue Chain = HI.getValue(1);

  if (Opcode == X86ISD::RDTSCP_DAG) {
    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");

    // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
    // the ECX register. Add 'ecx' explicitly to the chain.
    SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
                                     HI.getValue(2));
    // Explicitly store the content of ECX at the location passed in input
    // to the 'rdtscp' intrinsic.
    Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
                         MachinePointerInfo());
  }

  if (Subtarget.is64Bit()) {
    // The EDX register is loaded with the high-order 32 bits of the MSR, and
    // the EAX register is loaded with the low-order 32 bits.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}
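// Example (hypothetical values): with EDX = 0x00000002 and EAX = 0x80000000,
// both merge strategies above yield the 64-bit counter value
// (0x00000002 << 32) | 0x80000000 = 0x0000000280000000.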
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SmallVector<SDValue, 2> Results;
  SDLoc DL(Op);
  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
                          Results);
  return DAG.getMergeValues(Results, DL);
}

static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue RegNode = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue EHGuard = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EHGuard only lives in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
  EHInfo->EHGuardFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

/// Emit Truncating Store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
                SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
  SDValue Ops[] = { Chain, Val, Ptr, Undef };
  return SignedSat ?
    DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
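// Example (illustrative): a signed saturating truncate store of the i16 value
// 300 to an i8 slot writes 127 (clamped to [-128, 127]); the unsigned variant
// writes 255 (clamped to [0, 255]).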
/// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
                      MachineMemOperand *MMO, SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Ops[] = { Chain, Ptr, Mask, Val };
  return SignedSat ?
    DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}

static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData) {
    if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
      return MarkEHRegistrationNode(Op, DAG);
    if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
      return MarkEHGuard(Op, DAG);
    if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
        IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
        IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
        IntNo == llvm::Intrinsic::x86_flags_write_u64) {
      // We need a frame pointer because this will get lowered to a PUSH/POP
      // sequence.
      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
      MFI.setHasCopyImplyingStackAdjustment(true);
      // Don't do anything here, we will expand these intrinsics out later
      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
      return SDValue();
    }
    return SDValue();
  }

  SDLoc dl(Op);
  switch(IntrData->Type) {
  default: llvm_unreachable("Unknown Intrinsic Type");
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, casted to i32.
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                      DAG.getConstant(1, dl, Op->getValueType(1)),
                      DAG.getConstant(X86::COND_B, dl, MVT::i32),
                      SDValue(Result.getNode(), 1) };
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
                                  Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
  case GATHER: {
    //gather(v1, mask, index, base, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
                         Chain, Subtarget);
  }
  case SCATTER: {
    //scatter(base, mask, index, v1, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Base = Op.getOperand(2);
    SDValue Mask = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                          Scale, Chain, Subtarget);
  }
  case PREFETCH: {
    SDValue Hint = Op.getOperand(6);
    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
    assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
    unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                           Subtarget);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    SmallVector<SDValue, 2> Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
                            Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC: {
    SmallVector<SDValue, 2> Results;
    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Get Extended Control Register.
  case XGETBV: {
    SmallVector<SDValue, 2> Results;
    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
  case XTEST: {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  // ADC/ADCX/SBB
  case ADX: {
    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
                                DAG.getConstant(-1, dl, MVT::i8));
    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
                              Op.getOperand(4), GenCF.getValue(1));
    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
                                 Op.getOperand(5), MachinePointerInfo());
    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
    SDValue Results[] = { SetCC, Store };
    return DAG.getMergeValues(Results, dl);
  }
  case COMPRESS_TO_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToCompress = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = DataToCompress.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // return just a store
      return DAG.getStore(Chain, dl, DataToCompress, Addr,
                          MemIntr->getMemOperand());

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
                              MemIntr->getMemOperand(),
                              false /* truncating */, true /* compressing */);
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToTruncate = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    EVT MemVT = MemIntr->getMemoryVT();

    uint16_t TruncationOp = IntrData->Opc0;
    switch (TruncationOp) {
    case X86ISD::VTRUNC: {
      if (isAllOnesConstant(Mask)) // return just a truncate store
        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
                                 MemIntr->getMemOperand());

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
                                MemIntr->getMemOperand(), true /* truncating */);
    }
    case X86ISD::VTRUNCUS:
    case X86ISD::VTRUNCS: {
      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
      if (isAllOnesConstant(Mask))
        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
                               MemIntr->getMemOperand(), DAG);

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
    }
    default:
      llvm_unreachable("Unsupported truncstore intrinsic");
    }
  }
  case EXPAND_FROM_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue PassThru = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = Op.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
    if (X86::isZeroNode(Mask))
      return DAG.getUNDEF(VT);

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
                             true /* expanding */);
  }
  }
}

SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}
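// Note (illustrative): for Depth > 0 the code above relies on the standard
// frame layout in which the word at [FP] is the caller's saved frame pointer
// and the word at [FP + SlotSize] (e.g. RBP + 8 on x86-64) is the caller's
// return address.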
19838 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
19839 SelectionDAG &DAG) const {
19840 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
19841 return getReturnAddressFrameIndex(DAG);
19844 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
19845 MachineFunction &MF = DAG.getMachineFunction();
19846 MachineFrameInfo &MFI = MF.getFrameInfo();
19847 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
19848 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19849 EVT VT = Op.getValueType();
19851 MFI.setFrameAddressIsTaken(true);
19853 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
19854 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
19855 // is not possible to crawl up the stack without looking at the unwind codes
19857 int FrameAddrIndex = FuncInfo->getFAIndex();
19858 if (!FrameAddrIndex) {
19859 // Set up a frame object for the return address.
19860 unsigned SlotSize = RegInfo->getSlotSize();
19861 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
19862 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
19863 FuncInfo->setFAIndex(FrameAddrIndex);
19865 return DAG.getFrameIndex(FrameAddrIndex, VT);
19868 unsigned FrameReg =
19869 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
19870 SDLoc dl(Op); // FIXME probably not meaningful
19871 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19872 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
19873 (FrameReg == X86::EBP && VT == MVT::i32)) &&
19874 "Invalid Frame Register!");
19875 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
19876 while (Depth--)
19877 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
19878 MachinePointerInfo());
19879 return FrameAddr;
19880 }
19882 // FIXME? Maybe this could be a TableGen attribute on some registers and
19883 // this table could be generated automatically from RegInfo.
19884 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
19885 SelectionDAG &DAG) const {
19886 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19887 const MachineFunction &MF = DAG.getMachineFunction();
19889 unsigned Reg = StringSwitch<unsigned>(RegName)
19890 .Case("esp", X86::ESP)
19891 .Case("rsp", X86::RSP)
19892 .Case("ebp", X86::EBP)
19893 .Case("rbp", X86::RBP)
19896 if (Reg == X86::EBP || Reg == X86::RBP) {
19897 if (!TFI.hasFP(MF))
19898 report_fatal_error("register " + StringRef(RegName) +
19899 " is allocatable: function has no frame pointer");
19902 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19903 unsigned FrameReg =
19904 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
19905 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
19906 "Invalid Frame Register!");
19914 report_fatal_error("Invalid register name global variable");
19917 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
19918 SelectionDAG &DAG) const {
19919 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19920 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
19921 }
19923 unsigned X86TargetLowering::getExceptionPointerRegister(
19924 const Constant *PersonalityFn) const {
19925 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
19926 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
19928 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
19929 }
19931 unsigned X86TargetLowering::getExceptionSelectorRegister(
19932 const Constant *PersonalityFn) const {
19933 // Funclet personalities don't use selectors (the runtime does the selection).
19934 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
19935 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
19936 }
19938 bool X86TargetLowering::needsFixedCatchObjects() const {
19939 return Subtarget.isTargetWin64();
19940 }
19942 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
19943 SDValue Chain = Op.getOperand(0);
19944 SDValue Offset = Op.getOperand(1);
19945 SDValue Handler = Op.getOperand(2);
19946 SDLoc dl(Op);
19948 EVT PtrVT = getPointerTy(DAG.getDataLayout());
19949 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19950 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
19951 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
19952 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
19953 "Invalid Frame Register!");
19954 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
19955 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
19957 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
19958 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
19959 dl));
19960 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
19961 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
19962 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
19964 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
19965 DAG.getRegister(StoreAddrReg, PtrVT));
19966 }
19968 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
19969 SelectionDAG &DAG) const {
19970 SDLoc DL(Op);
19971 // If the subtarget is not 64bit, we may need the global base reg
19972 // after isel expand pseudo, i.e., after CGBR pass ran.
19973 // Therefore, ask for the GlobalBaseReg now, so that the pass
19974 // inserts the code for us in case we need it.
19975 // Otherwise, we will end up in a situation where we will
19976 // reference a virtual register that is not defined!
19977 if (!Subtarget.is64Bit()) {
19978 const X86InstrInfo *TII = Subtarget.getInstrInfo();
19979 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
19980 }
19981 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
19982 DAG.getVTList(MVT::i32, MVT::Other),
19983 Op.getOperand(0), Op.getOperand(1));
19984 }
19986 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
19987 SelectionDAG &DAG) const {
19988 SDLoc DL(Op);
19989 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
19990 Op.getOperand(0), Op.getOperand(1));
19991 }
19993 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
19994 SelectionDAG &DAG) const {
19995 SDLoc DL(Op);
19996 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
19997 Op.getOperand(0));
19998 }
20000 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20001 return Op.getOperand(0);
20002 }
20004 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20005 SelectionDAG &DAG) const {
20006 SDValue Root = Op.getOperand(0);
20007 SDValue Trmp = Op.getOperand(1); // trampoline
20008 SDValue FPtr = Op.getOperand(2); // nested function
20009 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20010 SDLoc dl(Op);
20012 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20013 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20015 if (Subtarget.is64Bit()) {
20016 SDValue OutChains[6];
20018 // Large code-model.
20019 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
20020 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20022 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20023 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20025 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
20027 // Load the pointer to the nested function into R11.
20028 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20029 SDValue Addr = Trmp;
20030 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20031 Addr, MachinePointerInfo(TrmpAddr));
20033 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20034 DAG.getConstant(2, dl, MVT::i64));
20035 OutChains[1] =
20036 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
20037 /* Alignment = */ 2);
20039 // Load the 'nest' parameter value into R10.
20040 // R10 is specified in X86CallingConv.td
20041 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
20042 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20043 DAG.getConstant(10, dl, MVT::i64));
20044 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20045 Addr, MachinePointerInfo(TrmpAddr, 10));
20047 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20048 DAG.getConstant(12, dl, MVT::i64));
20049 OutChains[3] =
20050 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
20051 /* Alignment = */ 2);
20053 // Jump to the nested function.
20054 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
20055 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20056 DAG.getConstant(20, dl, MVT::i64));
20057 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20058 Addr, MachinePointerInfo(TrmpAddr, 20));
20060 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
20061 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20062 DAG.getConstant(22, dl, MVT::i64));
20063 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
20064 Addr, MachinePointerInfo(TrmpAddr, 22));
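// With the encodings above, the finished 64-bit trampoline is, byte for byte:
//   0:  49 BB <FPtr:imm64>   movabsq $FPtr, %r11
//   10: 49 BA <Nest:imm64>   movabsq $Nest, %r10
//   20: 49 FF E3             jmpq *%r11
// (REX_WB = 0x49; MOV64ri|N86R11 = 0xBB; MOV64ri|N86R10 = 0xBA; the ModRM
// byte 0xE3 encodes mod=3, reg=4 (JMP), r/m=3 (R11).)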
20066 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20067 } else {
20068 const Function *Func =
20069 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
20070 CallingConv::ID CC = Func->getCallingConv();
20071 unsigned NestReg;
20073 switch (CC) {
20074 default:
20075 llvm_unreachable("Unsupported calling convention");
20076 case CallingConv::C:
20077 case CallingConv::X86_StdCall: {
20078 // Pass 'nest' parameter in ECX.
20079 // Must be kept in sync with X86CallingConv.td
20080 NestReg = X86::ECX;
20082 // Check that ECX wasn't needed by an 'inreg' parameter.
20083 FunctionType *FTy = Func->getFunctionType();
20084 const AttributeSet &Attrs = Func->getAttributes();
20086 if (!Attrs.isEmpty() && !Func->isVarArg()) {
20087 unsigned InRegCount = 0;
20088 unsigned Idx = 1;
20090 for (FunctionType::param_iterator I = FTy->param_begin(),
20091 E = FTy->param_end(); I != E; ++I, ++Idx)
20092 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
20093 auto &DL = DAG.getDataLayout();
20094 // FIXME: should only count parameters that are lowered to integers.
20095 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
20096 }
20098 if (InRegCount > 2) {
20099 report_fatal_error("Nest register in use - reduce number of inreg"
20105 case CallingConv::X86_FastCall:
20106 case CallingConv::X86_ThisCall:
20107 case CallingConv::Fast:
20108 // Pass 'nest' parameter in EAX.
20109 // Must be kept in sync with X86CallingConv.td
20110 NestReg = X86::EAX;
20111 break;
20112 }
20114 SDValue OutChains[4];
20115 SDValue Addr, Disp;
20117 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20118 DAG.getConstant(10, dl, MVT::i32));
20119 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
20121 // This is storing the opcode for MOV32ri.
20122 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
20123 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
20124 OutChains[0] =
20125 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
20126 Trmp, MachinePointerInfo(TrmpAddr));
20128 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20129 DAG.getConstant(1, dl, MVT::i32));
20130 OutChains[1] =
20131 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
20132 /* Alignment = */ 1);
20134 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
20135 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20136 DAG.getConstant(5, dl, MVT::i32));
20137 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
20138 Addr, MachinePointerInfo(TrmpAddr, 5),
20139 /* Alignment = */ 1);
20141 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20142 DAG.getConstant(6, dl, MVT::i32));
20143 OutChains[3] =
20144 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
20145 /* Alignment = */ 1);
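// The finished 10-byte 32-bit trampoline is:
//   0: B8+r <Nest:imm32>   movl $Nest, %ecx (or %eax)
//   5: E9   <Disp:rel32>   jmp FPtr, with Disp = FPtr - (Trmp + 10)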
20147 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20148 }
20149 }
20151 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
20152 SelectionDAG &DAG) const {
20153 /*
20154 The rounding mode is in bits 11:10 of FPSR, and has the following
20155 settings:
20156 00 Round to nearest
20157 01 Round to -inf
20158 10 Round to +inf
20159 11 Round to 0
20161 FLT_ROUNDS, on the other hand, expects the following:
20162 -1 Undefined
20163 0 Round to 0
20164 1 Round to nearest
20165 2 Round to +inf
20166 3 Round to -inf
20168 To perform the conversion, we do:
20169 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
20170 */
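// Worked example of the conversion: RC = 01 (round toward -inf) sets only
// bit 10, so (0x400 >> 9) = 2 and (2 + 1) & 3 = 3, FLT_ROUNDS' encoding of
// round toward -inf. RC = 11 (round to 0) gives ((1 | 2) + 1) & 3 = 0.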
20172 MachineFunction &MF = DAG.getMachineFunction();
20173 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20174 unsigned StackAlignment = TFI.getStackAlignment();
20175 MVT VT = Op.getSimpleValueType();
20177 SDLoc DL(Op);
20178 // Save FP Control Word to stack slot
20179 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
20180 SDValue StackSlot =
20181 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
20183 MachineMemOperand *MMO =
20184 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
20185 MachineMemOperand::MOStore, 2, 2);
20187 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
20188 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
20189 DAG.getVTList(MVT::Other),
20190 Ops, MVT::i16, MMO);
20192 // Load FP Control Word from stack slot
20193 SDValue CWD =
20194 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
20196 // Transform as necessary
20197 SDValue CWD1 =
20198 DAG.getNode(ISD::SRL, DL, MVT::i16,
20199 DAG.getNode(ISD::AND, DL, MVT::i16,
20200 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
20201 DAG.getConstant(11, DL, MVT::i8));
20202 SDValue CWD2 =
20203 DAG.getNode(ISD::SRL, DL, MVT::i16,
20204 DAG.getNode(ISD::AND, DL, MVT::i16,
20205 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
20206 DAG.getConstant(9, DL, MVT::i8));
20208 SDValue RetVal =
20209 DAG.getNode(ISD::AND, DL, MVT::i16,
20210 DAG.getNode(ISD::ADD, DL, MVT::i16,
20211 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
20212 DAG.getConstant(1, DL, MVT::i16)),
20213 DAG.getConstant(3, DL, MVT::i16));
20215 return DAG.getNode((VT.getSizeInBits() < 16 ?
20216 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
20217 }
20219 /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
20220 //
20221 // 1. i32/i64 128/256-bit vectors (native support requires VLX) are expanded
20222 // to a 512-bit vector.
20223 // 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
20224 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
20225 // split the vector, perform the operation on its Lo and Hi parts and
20226 // concatenate the results.
20227 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
20228 assert(Op.getOpcode() == ISD::CTLZ);
20229 SDLoc dl(Op);
20230 MVT VT = Op.getSimpleValueType();
20231 MVT EltVT = VT.getVectorElementType();
20232 unsigned NumElems = VT.getVectorNumElements();
20234 if (EltVT == MVT::i64 || EltVT == MVT::i32) {
20235 // Extend to 512 bit vector.
20236 assert((VT.is256BitVector() || VT.is128BitVector()) &&
20237 "Unsupported value type for operation");
20239 MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
20240 SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
20241 DAG.getUNDEF(NewVT),
20242 Op.getOperand(0),
20243 DAG.getIntPtrConstant(0, dl));
20244 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
20246 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
20247 DAG.getIntPtrConstant(0, dl));
20248 }
20250 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
20251 "Unsupported element type");
20253 if (16 < NumElems) {
20254 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
20255 SDValue Lo, Hi;
20256 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
20257 MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
20259 Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
20260 Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
20262 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
20263 }
20265 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
20267 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
20268 "Unsupported value type for operation");
20270 // Use native supported vector instruction vplzcntd.
20271 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
20272 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
20273 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
20274 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
20276 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
20277 }
20279 // Lower CTLZ using a PSHUFB lookup table implementation.
20280 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
20281 const X86Subtarget &Subtarget,
20282 SelectionDAG &DAG) {
20283 MVT VT = Op.getSimpleValueType();
20284 int NumElts = VT.getVectorNumElements();
20285 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
20286 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
20288 // Per-nibble leading zero PSHUFB lookup table.
20289 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
20290 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
20291 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
20292 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
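// For example, byte 0x1A has hi nibble 0x1, so LUT[1] = 3 leading zeros and
// the lo result is masked away below: ctlz(0x1A) = 3. For 0x0A the hi nibble
// is zero, so LUT[0] = 4 and LUT[0xA] = 0 are added: ctlz(0x0A) = 4.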
20294 SmallVector<SDValue, 64> LUTVec;
20295 for (int i = 0; i < NumBytes; ++i)
20296 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
20297 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
20299 // Begin by bitcasting the input to byte vector, then split those bytes
20300 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
20301 // If the hi input nibble is zero then we add both results together, otherwise
20302 // we just take the hi result (by masking the lo result to zero before the
20303 // add).
20304 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
20305 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
20307 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
20308 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
20309 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
20310 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
20311 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
20313 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
20314 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
20315 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
20316 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
20318 // Merge result back from vXi8 back to VT, working on the lo/hi halves
20319 // of the current vector width in the same way we did for the nibbles.
20320 // If the upper half of the input element is zero then add the halves'
20321 // leading zero counts together, otherwise just use the upper half's.
20322 // Double the width of the result until we are at target width.
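// For example, merging two i8 counts into one i16 count for input 0x00F0:
// the upper byte is zero, so HiZ is all-ones and its count (8) is added to
// the lower byte's count ctlz(0xF0) = 0, giving ctlz(0x00F0) = 8. For
// 0x0100 the upper byte is nonzero, so only its count (7) is used.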
20323 while (CurrVT != VT) {
20324 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
20325 int CurrNumElts = CurrVT.getVectorNumElements();
20326 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
20327 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
20328 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
20330 // Check if the upper half of the input element is zero.
20331 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
20332 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
20333 HiZ = DAG.getBitcast(NextVT, HiZ);
20335 // Move the upper/lower halves to the lower bits as we'll be extending to
20336 // NextVT. Mask the lower result to zero if HiZ is true and add the results
20337 // together.
20338 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
20339 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
20340 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
20341 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
20342 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
20344 CurrVT = NextVT;
20345 }
20347 return Res;
20348 }
20349 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
20350 const X86Subtarget &Subtarget,
20351 SelectionDAG &DAG) {
20352 MVT VT = Op.getSimpleValueType();
20353 SDValue Op0 = Op.getOperand(0);
20355 if (Subtarget.hasAVX512())
20356 return LowerVectorCTLZ_AVX512(Op, DAG);
20358 // Decompose 256-bit ops into smaller 128-bit ops.
20359 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
20360 unsigned NumElems = VT.getVectorNumElements();
20362 // Extract each 128-bit vector, perform ctlz and concat the result.
20363 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
20364 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
20366 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
20367 DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
20368 DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
20369 }
20371 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
20372 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
20373 }
20375 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
20376 SelectionDAG &DAG) {
20377 MVT VT = Op.getSimpleValueType();
20378 MVT OpVT = VT;
20379 unsigned NumBits = VT.getSizeInBits();
20380 SDLoc dl(Op);
20381 unsigned Opc = Op.getOpcode();
20383 if (VT.isVector())
20384 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
20386 Op = Op.getOperand(0);
20387 if (VT == MVT::i8) {
20388 // Zero extend to i32 since there is not an i8 bsr.
20389 OpVT = MVT::i32;
20390 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
20391 }
20393 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
20394 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
20395 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
20397 if (Opc == ISD::CTLZ) {
20398 // If src is zero (i.e. bsr sets ZF), returns NumBits.
20399 SDValue Ops[] = {
20400 Op,
20401 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
20402 DAG.getConstant(X86::COND_E, dl, MVT::i8),
20403 Op.getValue(1)
20404 };
20405 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
20406 }
20408 // Finally xor with NumBits-1.
20409 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
20410 DAG.getConstant(NumBits - 1, dl, OpVT));
20412 if (VT == MVT::i8)
20413 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
20414 return Op;
20415 }
20417 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
20418 MVT VT = Op.getSimpleValueType();
20419 unsigned NumBits = VT.getScalarSizeInBits();
20420 SDLoc dl(Op);
20422 if (VT.isVector()) {
20423 SDValue N0 = Op.getOperand(0);
20424 SDValue Zero = DAG.getConstant(0, dl, VT);
20426 // lsb(x) = (x & -x)
20427 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
20428 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
20430 // cttz_undef(x) = (width - 1) - ctlz(lsb)
20431 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
20432 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
20433 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
20434 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
20435 }
20437 // cttz(x) = ctpop(lsb - 1)
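// e.g. x = 40 = 0b101000: lsb = x & -x = 8, and ctpop(8 - 1) = ctpop(0b111)
// = 3 = cttz(40).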
20438 SDValue One = DAG.getConstant(1, dl, VT);
20439 return DAG.getNode(ISD::CTPOP, dl, VT,
20440 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
20441 }
20443 assert(Op.getOpcode() == ISD::CTTZ &&
20444 "Only scalar CTTZ requires custom lowering");
20446 // Issue a bsf (scan bits forward) which also sets EFLAGS.
20447 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
20448 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
20450 // If src is zero (i.e. bsf sets ZF), returns NumBits.
20451 SDValue Ops[] = {
20452 Op,
20453 DAG.getConstant(NumBits, dl, VT),
20454 DAG.getConstant(X86::COND_E, dl, MVT::i8),
20455 Op.getValue(1)
20456 };
20457 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
20458 }
20460 /// Break a 256-bit integer operation into two new 128-bit ones and then
20461 /// concatenate the result back.
20462 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
20463 MVT VT = Op.getSimpleValueType();
20465 assert(VT.is256BitVector() && VT.isInteger() &&
20466 "Unsupported value type for operation");
20468 unsigned NumElems = VT.getVectorNumElements();
20470 SDLoc dl(Op);
20471 // Extract the LHS vectors
20472 SDValue LHS = Op.getOperand(0);
20473 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
20474 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
20476 // Extract the RHS vectors
20477 SDValue RHS = Op.getOperand(1);
20478 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
20479 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
20481 MVT EltVT = VT.getVectorElementType();
20482 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
20484 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20485 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
20486 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
20487 }
20489 /// Break a 512-bit integer operation into two new 256-bit ones and then
20490 /// concatenate the result back.
20491 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
20492 MVT VT = Op.getSimpleValueType();
20494 assert(VT.is512BitVector() && VT.isInteger() &&
20495 "Unsupported value type for operation");
20497 unsigned NumElems = VT.getVectorNumElements();
20499 SDLoc dl(Op);
20500 // Extract the LHS vectors
20501 SDValue LHS = Op.getOperand(0);
20502 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
20503 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
20505 // Extract the RHS vectors
20506 SDValue RHS = Op.getOperand(1);
20507 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
20508 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
20510 MVT EltVT = VT.getVectorElementType();
20511 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
20513 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20514 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
20515 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
20516 }
20518 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
20519 if (Op.getValueType() == MVT::i1)
20520 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
20521 Op.getOperand(0), Op.getOperand(1));
20522 assert(Op.getSimpleValueType().is256BitVector() &&
20523 Op.getSimpleValueType().isInteger() &&
20524 "Only handle AVX 256-bit vector integer operation");
20525 return Lower256IntArith(Op, DAG);
20526 }
20528 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
20529 if (Op.getValueType() == MVT::i1)
20530 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
20531 Op.getOperand(0), Op.getOperand(1));
20532 assert(Op.getSimpleValueType().is256BitVector() &&
20533 Op.getSimpleValueType().isInteger() &&
20534 "Only handle AVX 256-bit vector integer operation");
20535 return Lower256IntArith(Op, DAG);
20536 }
20538 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
20539 assert(Op.getSimpleValueType().is256BitVector() &&
20540 Op.getSimpleValueType().isInteger() &&
20541 "Only handle AVX 256-bit vector integer operation");
20542 return Lower256IntArith(Op, DAG);
20543 }
20545 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
20546 SelectionDAG &DAG) {
20547 SDLoc dl(Op);
20548 MVT VT = Op.getSimpleValueType();
20550 if (VT == MVT::i1)
20551 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
20553 // Decompose 256-bit ops into smaller 128-bit ops.
20554 if (VT.is256BitVector() && !Subtarget.hasInt256())
20555 return Lower256IntArith(Op, DAG);
20557 SDValue A = Op.getOperand(0);
20558 SDValue B = Op.getOperand(1);
20560 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
20561 // vector pairs, multiply and truncate.
20562 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
20563 if (Subtarget.hasInt256()) {
20564 // For 512-bit vectors, split into 256-bit vectors to allow the
20565 // sign-extension to occur.
20566 if (VT == MVT::v64i8)
20567 return Lower512IntArith(Op, DAG);
20569 // For 256-bit vectors, split into 128-bit vectors to allow the
20570 // sign-extension to occur. We don't need this on AVX512BW as we can
20571 // safely sign-extend to v32i16.
20572 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
20573 return Lower256IntArith(Op, DAG);
20575 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
20576 return DAG.getNode(
20577 ISD::TRUNCATE, dl, VT,
20578 DAG.getNode(ISD::MUL, dl, ExVT,
20579 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
20580 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
20581 }
20583 assert(VT == MVT::v16i8 &&
20584 "Pre-AVX2 support only supports v16i8 multiplication");
20585 MVT ExVT = MVT::v8i16;
20587 // Extract the lo parts and sign extend to i16
20589 if (Subtarget.hasSSE41()) {
20590 ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
20591 BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
20592 } else {
20593 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
20594 -1, 4, -1, 5, -1, 6, -1, 7};
20595 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20596 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20597 ALo = DAG.getBitcast(ExVT, ALo);
20598 BLo = DAG.getBitcast(ExVT, BLo);
20599 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
20600 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
20601 }
20603 // Extract the hi parts and sign extend to i16
20604 SDValue AHi, BHi;
20605 if (Subtarget.hasSSE41()) {
20606 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20607 -1, -1, -1, -1, -1, -1, -1, -1};
20608 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20609 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20610 AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
20611 BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
20612 } else {
20613 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
20614 -1, 12, -1, 13, -1, 14, -1, 15};
20615 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20616 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20617 AHi = DAG.getBitcast(ExVT, AHi);
20618 BHi = DAG.getBitcast(ExVT, BHi);
20619 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
20620 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
20621 }
20623 // Multiply, mask the lower 8bits of the lo/hi results and pack
20624 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
20625 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
20626 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
20627 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
20628 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
20629 }
20631 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
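// PMULUDQ multiplies the even lanes (0 and 2) of each operand into two
// 64-bit products. Shuffling the odd lanes into even positions and
// multiplying again produces the other two products, and the final
// {0, 4, 2, 6} shuffle of the bitcast results keeps the low 32 bits of each
// 64-bit product - exactly the v4i32 multiply.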
20632 if (VT == MVT::v4i32) {
20633 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
20634 "Should not custom lower when pmuldq is available!");
20636 // Extract the odd parts.
20637 static const int UnpackMask[] = { 1, -1, 3, -1 };
20638 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
20639 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
20641 // Multiply the even parts.
20642 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
20643 // Now multiply odd parts.
20644 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
20646 Evens = DAG.getBitcast(VT, Evens);
20647 Odds = DAG.getBitcast(VT, Odds);
20649 // Merge the two vectors back together with a shuffle. This expands into 2
20650 // shuffles.
20651 static const int ShufMask[] = { 0, 4, 2, 6 };
20652 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
20653 }
20655 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
20656 "Only know how to lower V2I64/V4I64/V8I64 multiply");
20658 // 32-bit vector types used for MULDQ/MULUDQ.
20659 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20661 // MULDQ returns the 64-bit result of the signed multiplication of the lower
20662 // 32-bits. We can lower with this if the sign bits stretch that far.
20663 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
20664 DAG.ComputeNumSignBits(B) > 32) {
20665 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
20666 DAG.getBitcast(MulVT, B));
20667 }
20669 // Ahi = psrlqi(a, 32);
20670 // Bhi = psrlqi(b, 32);
20671 //
20672 // AloBlo = pmuludq(a, b);
20673 // AloBhi = pmuludq(a, Bhi);
20674 // AhiBlo = pmuludq(Ahi, b);
20675 //
20676 // Hi = psllqi(AloBhi + AhiBlo, 32);
20677 // return AloBlo + Hi;
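// This is schoolbook multiplication on 32-bit digits:
//   (2^32*Ahi + Alo) * (2^32*Bhi + Blo) mod 2^64
//     = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo)
// because the 2^64*Ahi*Bhi term vanishes modulo 2^64.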
20678 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
20679 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
20680 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
20682 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
20683 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
20684 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
20686 // Bit cast to 32-bit vectors for MULUDQ.
20687 SDValue Alo = DAG.getBitcast(MulVT, A);
20688 SDValue Blo = DAG.getBitcast(MulVT, B);
20690 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
20692 // Only multiply lo/hi halves that aren't known to be zero.
20693 SDValue AloBlo = Zero;
20694 if (!ALoIsZero && !BLoIsZero)
20695 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
20697 SDValue AloBhi = Zero;
20698 if (!ALoIsZero && !BHiIsZero) {
20699 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
20700 Bhi = DAG.getBitcast(MulVT, Bhi);
20701 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
20702 }
20704 SDValue AhiBlo = Zero;
20705 if (!AHiIsZero && !BLoIsZero) {
20706 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
20707 Ahi = DAG.getBitcast(MulVT, Ahi);
20708 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
20709 }
20711 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
20712 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
20714 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
20715 }
20717 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
20718 SelectionDAG &DAG) {
20719 SDLoc dl(Op);
20720 MVT VT = Op.getSimpleValueType();
20722 // Decompose 256-bit ops into smaller 128-bit ops.
20723 if (VT.is256BitVector() && !Subtarget.hasInt256())
20724 return Lower256IntArith(Op, DAG);
20726 // Only i8 vectors should need custom lowering after this.
20727 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
20728 "Unsupported vector type");
20730 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
20731 // logical shift down the upper half and pack back to i8.
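// e.g. for MULHU on i8: 0xFF * 0xFF = 0xFE01 in the widened i16 lane, and
// the logical shift right by 8 leaves 0xFE, the high half of the product.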
20732 SDValue A = Op.getOperand(0);
20733 SDValue B = Op.getOperand(1);
20735 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
20736 // and then ashr/lshr the upper bits down to the lower bits before multiply.
20737 unsigned Opcode = Op.getOpcode();
20738 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
20739 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
20741 // AVX2 implementations - extend xmm subvectors to ymm.
20742 if (Subtarget.hasInt256()) {
20743 SDValue Lo = DAG.getIntPtrConstant(0, dl);
20744 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
20746 if (VT == MVT::v32i8) {
20747 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
20748 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
20749 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
20750 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
20751 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
20752 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
20753 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
20754 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
20755 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
20756 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
20757 DAG.getConstant(8, dl, MVT::v16i16));
20758 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
20759 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
20760 DAG.getConstant(8, dl, MVT::v16i16));
20761 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
20762 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
20763 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
20764 16, 17, 18, 19, 20, 21, 22, 23};
20765 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20766 24, 25, 26, 27, 28, 29, 30, 31};
20767 return DAG.getNode(X86ISD::PACKUS, dl, VT,
20768 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
20769 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
20770 }
20772 SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
20773 SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
20774 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
20775 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
20776 DAG.getConstant(8, dl, MVT::v16i16));
20777 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
20778 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
20779 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
20780 }
20782 assert(VT == MVT::v16i8 &&
20783 "Pre-AVX2 support only supports v16i8 multiplication");
20784 MVT ExVT = MVT::v8i16;
20786 // Extract the lo parts and zero/sign extend to i16.
20787 SDValue ALo, BLo;
20788 if (Subtarget.hasSSE41()) {
20789 ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
20790 BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
20791 } else {
20792 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
20793 -1, 4, -1, 5, -1, 6, -1, 7};
20794 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20795 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20796 ALo = DAG.getBitcast(ExVT, ALo);
20797 BLo = DAG.getBitcast(ExVT, BLo);
20798 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
20799 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
20800 }
20802 // Extract the hi parts and zero/sign extend to i16.
20803 SDValue AHi, BHi;
20804 if (Subtarget.hasSSE41()) {
20805 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20806 -1, -1, -1, -1, -1, -1, -1, -1};
20807 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20808 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20809 AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
20810 BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
20811 } else {
20812 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
20813 -1, 12, -1, 13, -1, 14, -1, 15};
20814 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20815 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20816 AHi = DAG.getBitcast(ExVT, AHi);
20817 BHi = DAG.getBitcast(ExVT, BHi);
20818 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
20819 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
20820 }
20822 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
20823 // pack back to v16i8.
20824 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
20825 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
20826 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
20827 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
20828 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
20829 }
20831 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
20832 assert(Subtarget.isTargetWin64() && "Unexpected target");
20833 EVT VT = Op.getValueType();
20834 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
20835 "Unexpected return type for lowering");
20839 switch (Op->getOpcode()) {
20840 default: llvm_unreachable("Unexpected request for libcall!");
20841 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
20842 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
20843 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
20844 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
20845 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
20846 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
20847 }
20849 SDLoc dl(Op);
20850 SDValue InChain = DAG.getEntryNode();
20852 TargetLowering::ArgListTy Args;
20853 TargetLowering::ArgListEntry Entry;
20854 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
20855 EVT ArgVT = Op->getOperand(i).getValueType();
20856 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
20857 "Unexpected argument type for lowering");
20858 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
20859 Entry.Node = StackPtr;
20860 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
20861 MachinePointerInfo(), /* Alignment = */ 16);
20862 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
20863 Entry.Ty = PointerType::get(ArgTy, 0);
20864 Entry.isSExt = false;
20865 Entry.isZExt = false;
20866 Args.push_back(Entry);
20867 }
20869 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
20870 getPointerTy(DAG.getDataLayout()));
20872 TargetLowering::CallLoweringInfo CLI(DAG);
20873 CLI.setDebugLoc(dl).setChain(InChain)
20874 .setCallee(getLibcallCallingConv(LC),
20875 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
20876 Callee, std::move(Args))
20877 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
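// Under the Win64 ABI the 128-bit result comes back as a vector in XMM0;
// model it as v2i64 and bitcast back to the original integer type below.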
20879 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20880 return DAG.getBitcast(VT, CallInfo.first);
20881 }
20883 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
20884 SelectionDAG &DAG) {
20885 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
20886 MVT VT = Op0.getSimpleValueType();
20887 SDLoc dl(Op);
20889 // Decompose 256-bit ops into smaller 128-bit ops.
20890 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
20891 unsigned Opcode = Op.getOpcode();
20892 unsigned NumElems = VT.getVectorNumElements();
20893 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
20894 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
20895 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
20896 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
20897 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
20898 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
20899 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
20900 SDValue Ops[] = {
20901 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
20902 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
20903 };
20904 return DAG.getMergeValues(Ops, dl);
20905 }
20907 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
20908 (VT == MVT::v8i32 && Subtarget.hasInt256()));
20910 // PMULxD operations multiply each even value (starting at 0) of LHS with
20911 // the related value of RHS and produce a widen result.
20912 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
20913 // => <2 x i64> <ae|cg>
20914 //
20915 // In other words, to have all the results, we need to perform two PMULxD:
20916 // 1. one with the even values.
20917 // 2. one with the odd values.
20918 // To achieve #2, we need to place the odd values at an even position.
20920 // Place the odd value at an even position (basically, shift all values 1
20921 // step to the left):
20922 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
20923 // <a|b|c|d> => <b|undef|d|undef>
20924 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
20925 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
20926 // <e|f|g|h> => <f|undef|h|undef>
20927 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
20928 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
20930 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
20931 // ints.
20932 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
20933 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
20934 unsigned Opcode =
20935 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
20936 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
20937 // => <2 x i64> <ae|cg>
20938 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
20939 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
20940 // => <2 x i64> <bf|dh>
20941 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
20943 // Shuffle it back into the right order.
20944 SDValue Highs, Lows;
20945 if (VT == MVT::v8i32) {
20946 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
20947 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
20948 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
20949 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
20950 } else {
20951 const int HighMask[] = {1, 5, 3, 7};
20952 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
20953 const int LowMask[] = {0, 4, 2, 6};
20954 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
20955 }
20957 // If we have a signed multiply but no PMULDQ, fix up the high parts of an
20958 // unsigned multiply.
20959 if (IsSigned && !Subtarget.hasSSE41()) {
20960 SDValue ShAmt = DAG.getConstant(
20961 31, dl,
20962 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
20963 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
20964 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
20965 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
20966 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
20968 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
20969 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
20970 }
20972 // The first result of MUL_LOHI is actually the low value, followed by the
20973 // high value.
20974 SDValue Ops[] = {Lows, Highs};
20975 return DAG.getMergeValues(Ops, dl);
20976 }
20978 // Return true if the required (according to Opcode) shift-imm form is natively
20979 // supported by the Subtarget
20980 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
20981 unsigned Opcode) {
20982 if (VT.getScalarSizeInBits() < 16)
20983 return false;
20985 if (VT.is512BitVector() &&
20986 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
20987 return true;
20989 bool LShift = VT.is128BitVector() ||
20990 (VT.is256BitVector() && Subtarget.hasInt256());
20992 bool AShift = LShift && (Subtarget.hasVLX() ||
20993 (VT != MVT::v2i64 && VT != MVT::v4i64));
20994 return (Opcode == ISD::SRA) ? AShift : LShift;
20995 }
20997 // The shift amount is a variable, but it is the same for all vector lanes.
20998 // These instructions are defined together with shift-immediate.
20999 static
21000 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21001 unsigned Opcode) {
21002 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21003 }
21005 // Return true if the required (according to Opcode) variable-shift form is
21006 // natively supported by the Subtarget
21007 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21008 unsigned Opcode) {
21010 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21011 return false;
21013 // vXi16 supported only on AVX-512, BWI
21014 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21015 return false;
21017 if (VT.is512BitVector() || Subtarget.hasVLX())
21018 return true;
21020 bool LShift = VT.is128BitVector() || VT.is256BitVector();
21021 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21022 return (Opcode == ISD::SRA) ? AShift : LShift;
21023 }
21025 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21026 const X86Subtarget &Subtarget) {
21027 MVT VT = Op.getSimpleValueType();
21028 SDLoc dl(Op);
21029 SDValue R = Op.getOperand(0);
21030 SDValue Amt = Op.getOperand(1);
21032 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21033 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21035 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
21036 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21037 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
21038 SDValue Ex = DAG.getBitcast(ExVT, R);
21040 if (ShiftAmt >= 32) {
21041 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
21042 SDValue Upper =
21043 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
21044 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21045 ShiftAmt - 32, DAG);
21046 if (VT == MVT::v2i64)
21047 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
21048 if (VT == MVT::v4i64)
21049 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21050 {9, 1, 11, 3, 13, 5, 15, 7});
21051 } else {
21052 // SRA upper i32, SHL whole i64 and select lower i32.
21053 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21054 ShiftAmt, DAG);
21055 SDValue Lower =
21056 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
21057 Lower = DAG.getBitcast(ExVT, Lower);
21058 if (VT == MVT::v2i64)
21059 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
21060 if (VT == MVT::v4i64)
21061 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21062 {8, 1, 10, 3, 12, 5, 14, 7});
21063 }
21064 return DAG.getBitcast(VT, Ex);
21065 };
21067 // Optimize shl/srl/sra with constant shift amount.
21068 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21069 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
21070 uint64_t ShiftAmt = ShiftConst->getZExtValue();
21072 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21073 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21075 // i64 SRA needs to be performed as partial shifts.
21076 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
21077 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
21078 return ArithmeticShiftRight64(ShiftAmt);
21080 if (VT == MVT::v16i8 ||
21081 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
21082 VT == MVT::v64i8) {
21083 unsigned NumElts = VT.getVectorNumElements();
21084 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
21086 // Simple i8 add case
21087 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
21088 return DAG.getNode(ISD::ADD, dl, VT, R, R);
21090 // ashr(R, 7) === cmp_slt(R, 0)
21091 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
21092 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21093 if (VT.is512BitVector()) {
21094 assert(VT == MVT::v64i8 && "Unexpected element type!");
21095 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
21096 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
21097 }
21098 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
21099 }
21101 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
21102 if (VT == MVT::v16i8 && Subtarget.hasXOP())
21103 return SDValue();
21105 if (Op.getOpcode() == ISD::SHL) {
21106 // Make a large shift.
21107 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
21108 R, ShiftAmt, DAG);
21109 SHL = DAG.getBitcast(VT, SHL);
21110 // Zero out the rightmost bits.
21111 return DAG.getNode(ISD::AND, dl, VT, SHL,
21112 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
21113 }
21114 if (Op.getOpcode() == ISD::SRL) {
21115 // Make a large shift.
21116 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
21117 R, ShiftAmt, DAG);
21118 SRL = DAG.getBitcast(VT, SRL);
21119 // Zero out the leftmost bits.
21120 return DAG.getNode(ISD::AND, dl, VT, SRL,
21121 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
21122 }
21123 if (Op.getOpcode() == ISD::SRA) {
21124 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
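// e.g. R = 0x80 (-128), Amt = 1: lshr gives 0x40, Mask = 128 >> 1 = 0x40,
// the xor gives 0x00, and 0x00 - 0x40 = 0xC0 = -64, the arithmetic result.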
21125 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21127 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
21128 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
21129 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
21130 return Res;
21131 }
21132 llvm_unreachable("Unknown shift opcode.");
21133 }
21134 }
21135 }
21137 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
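// On 32-bit targets a v2i64 shift amount is legalized to a bitcast v4i32
// build_vector, so the 64-bit amount is reassembled below from Ratio
// (e.g. 4 / 2 = 2) consecutive narrow constants, each contributing
// 64 / Ratio = (1 << (6 - RatioInLog2)) bits.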
21138 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
21139 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
21140 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
21142 // Peek through any splat that was introduced for i64 shift vectorization.
21143 int SplatIndex = -1;
21144 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
21145 if (SVN->isSplat()) {
21146 SplatIndex = SVN->getSplatIndex();
21147 Amt = Amt.getOperand(0);
21148 assert(SplatIndex < (int)VT.getVectorNumElements() &&
21149 "Splat shuffle referencing second operand");
21152 if (Amt.getOpcode() != ISD::BITCAST ||
21153 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
21154 return SDValue();
21156 Amt = Amt.getOperand(0);
21157 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21158 VT.getVectorNumElements();
21159 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
21160 uint64_t ShiftAmt = 0;
21161 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
21162 for (unsigned i = 0; i != Ratio; ++i) {
21163 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
21164 if (!C)
21165 return SDValue();
21167 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
21168 }
21170 // Check remaining shift amounts (if not a splat).
21171 if (SplatIndex < 0) {
21172 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21173 uint64_t ShAmt = 0;
21174 for (unsigned j = 0; j != Ratio; ++j) {
21175 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
21176 if (!C)
21177 return SDValue();
21179 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
21180 }
21181 if (ShAmt != ShiftAmt)
21182 return SDValue();
21183 }
21184 }
21186 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21187 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21189 if (Op.getOpcode() == ISD::SRA)
21190 return ArithmeticShiftRight64(ShiftAmt);
21191 }
21193 return SDValue();
21194 }
21196 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
21197 const X86Subtarget &Subtarget) {
21198 MVT VT = Op.getSimpleValueType();
21199 SDLoc dl(Op);
21200 SDValue R = Op.getOperand(0);
21201 SDValue Amt = Op.getOperand(1);
21203 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21204 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21206 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
21207 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
21209 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
21210 SDValue BaseShAmt;
21211 MVT EltVT = VT.getVectorElementType();
21213 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
21214 // Check if this build_vector node is doing a splat.
21215 // If so, then set BaseShAmt equal to the splat value.
21216 BaseShAmt = BV->getSplatValue();
21217 if (BaseShAmt && BaseShAmt.isUndef())
21218 BaseShAmt = SDValue();
21219 } else {
21220 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
21221 Amt = Amt.getOperand(0);
21223 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
21224 if (SVN && SVN->isSplat()) {
21225 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
21226 SDValue InVec = Amt.getOperand(0);
21227 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
21228 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
21229 "Unexpected shuffle index found!");
21230 BaseShAmt = InVec.getOperand(SplatIdx);
21231 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
21232 if (ConstantSDNode *C =
21233 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
21234 if (C->getZExtValue() == SplatIdx)
21235 BaseShAmt = InVec.getOperand(1);
21236 }
21237 }
21239 if (!BaseShAmt)
21240 // Avoid introducing an extract element from a shuffle.
21241 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
21242 DAG.getIntPtrConstant(SplatIdx, dl));
21243 }
21244 }
21246 if (BaseShAmt.getNode()) {
21247 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
21248 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
21249 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
21250 else if (EltVT.bitsLT(MVT::i32))
21251 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
21253 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
21254 }
21255 }
21257 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21258 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
21259 Amt.getOpcode() == ISD::BITCAST &&
21260 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
21261 Amt = Amt.getOperand(0);
21262 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21263 VT.getVectorNumElements();
21264 std::vector<SDValue> Vals(Ratio);
21265 for (unsigned i = 0; i != Ratio; ++i)
21266 Vals[i] = Amt.getOperand(i);
21267 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21268 for (unsigned j = 0; j != Ratio; ++j)
21269 if (Vals[j] != Amt.getOperand(i + j))
21270 return SDValue();
21271 }
21273 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
21274 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
21275 }
21277 return SDValue();
21278 }
21279 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
21280 SelectionDAG &DAG) {
21281 MVT VT = Op.getSimpleValueType();
21282 SDLoc dl(Op);
21283 SDValue R = Op.getOperand(0);
21284 SDValue Amt = Op.getOperand(1);
21285 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
21287 assert(VT.isVector() && "Custom lowering only for vector shifts!");
21288 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
21290 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
21291 return V;
21293 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
21294 return V;
21296 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
21297 return Op;
21299 // XOP has 128-bit variable logical/arithmetic shifts.
21300 // +ve/-ve Amt = shift left/right.
21301 if (Subtarget.hasXOP() &&
21302 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
21303 VT == MVT::v8i16 || VT == MVT::v16i8)) {
21304 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
21305 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21306 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
21307 }
21308 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
21309 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
21310 if (Op.getOpcode() == ISD::SRA)
21311 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
21312 }
21314 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
21315 // shifts per-lane and then shuffle the partial results back together.
21316 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
21317 // Splat the shift amounts so the scalar shifts above will catch it.
21318 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
21319 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
21320 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
21321 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
21322 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
21323 }
21325 // i64 vector arithmetic shift can be emulated with the transform:
21326 // M = lshr(SIGN_BIT, Amt)
21327 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
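// e.g. R = -256 = 0xFFFFFFFFFFFFFF00, Amt = 4: M = 0x0800000000000000 and
// lshr(R, 4) = 0x0FFFFFFFFFFFFFF0; the xor flips the shifted-in sign-bit
// position and the subtract then sign-extends, giving 0xFFFFFFFFFFFFFFF0
// = -16 = ashr(-256, 4).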
21328 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
21329 Op.getOpcode() == ISD::SRA) {
21330 SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
21331 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
21332 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21333 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
21334 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
21335 return R;
21336 }
21338 // If possible, lower this packed shift into a vector multiply instead of
21339 // expanding it into a sequence of scalar shifts.
21340 // Do this only if the vector shift count is a constant build_vector.
21341 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
21342 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
21343 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
21344 SmallVector<SDValue, 8> Elts;
21345 MVT SVT = VT.getVectorElementType();
21346 unsigned SVTBits = SVT.getSizeInBits();
21347 APInt One(SVTBits, 1);
21348 unsigned NumElems = VT.getVectorNumElements();
21350 for (unsigned i = 0; i != NumElems; ++i) {
21351 SDValue Op = Amt->getOperand(i);
21352 if (Op->isUndef()) {
21353 Elts.push_back(Op);
21354 continue;
21355 }
21357 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
21358 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
21359 uint64_t ShAmt = C.getZExtValue();
21360 if (ShAmt >= SVTBits) {
21361 Elts.push_back(DAG.getUNDEF(SVT));
21362 continue;
21363 }
21364 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
21365 }
21366 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
21367 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
  // Lower SHL with variable shift amount.
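  // The sequence below materializes (1 << Amt) per lane through the f32
  // exponent field instead of a variable shift: (Amt << 23) + 0x3f800000 is
  // the IEEE-754 encoding of the float 2^Amt, since 0x3f800000 is 1.0f with
  // biased exponent 127. E.g. Amt = 3 gives (3 << 23) + 0x3f800000 =
  // 0x41000000 = 8.0f, and converting back to integer yields the power of
  // two to multiply by.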
  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));

    Op = DAG.getNode(ISD::ADD, dl, VT, Op,
                     DAG.getConstant(0x3f800000U, dl, VT));
    Op = DAG.getBitcast(MVT::v4f32, Op);
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }
  // If possible, lower this shift as a sequence of two shifts by
  // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
  //
  // (v4i32 (srl A, (build_vector <X, Y, Y, Y>)))
  //
  // Could be rewritten as:
  // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
  //
  // The advantage is that the two shifts from the example would be
  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
  // the vector shift into four scalar shifts plus four pairs of vector
  // insert/extract.
  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
    unsigned TargetOpcode = X86ISD::MOVSS;
    bool CanBeSimplified;
    // The splat value for the first packed shift (the 'X' from the example).
    SDValue Amt1 = Amt->getOperand(0);
    // The splat value for the second packed shift (the 'Y' from the example).
    SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);

    // See if it is possible to replace this node with a sequence of
    // two shifts followed by a MOVSS/MOVSD/PBLEND.
    if (VT == MVT::v4i32) {
      // Check if it is legal to use a MOVSS.
      CanBeSimplified = Amt2 == Amt->getOperand(2) &&
                        Amt2 == Amt->getOperand(3);
      if (!CanBeSimplified) {
        // Otherwise, check if we can still simplify this node using a MOVSD.
        CanBeSimplified = Amt1 == Amt->getOperand(1) &&
                          Amt->getOperand(2) == Amt->getOperand(3);
        TargetOpcode = X86ISD::MOVSD;
        Amt2 = Amt->getOperand(2);
      }
    } else {
      // Do similar checks for the case where the machine value type
      // is MVT::v8i16.
      CanBeSimplified = Amt1 == Amt->getOperand(1);
      for (unsigned i = 3; i != 8 && CanBeSimplified; ++i)
        CanBeSimplified = Amt2 == Amt->getOperand(i);

      if (!CanBeSimplified) {
        TargetOpcode = X86ISD::MOVSD;
        CanBeSimplified = true;
        Amt2 = Amt->getOperand(4);
        for (unsigned i = 0; i != 4 && CanBeSimplified; ++i)
          CanBeSimplified = Amt1 == Amt->getOperand(i);
        for (unsigned j = 4; j != 8 && CanBeSimplified; ++j)
          CanBeSimplified = Amt2 == Amt->getOperand(j);
      }
    }

    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
        isa<ConstantSDNode>(Amt2)) {
      // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
      MVT CastVT = MVT::v4i32;
      SDValue Splat1 =
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
      SDValue Splat2 =
          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
      SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
      SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
      if (TargetOpcode == X86ISD::MOVSD)
        return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
                                                       BitCast2,
                                                       {0, 1, 6, 7}));
      return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
                                                     BitCast2, {0, 5, 6, 7}));
    }
  }
  // v4i32 Non Uniform Shifts.
  // If the shift amount is constant we can shift each lane using the SSE2
  // immediate shifts, else we need to zero-extend each lane to the lower i64
  // and shift using the SSE2 variable shifts.
  // The separate results can then be blended together.
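  // Schematically (illustrative only), with Ri = shift(R, splat(Amt[i])):
  //   result = <R0[0], R1[1], R2[2], R3[3]>
  // assembled by the shuffles at the end of this block.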
  if (VT == MVT::v4i32) {
    unsigned Opc = Op.getOpcode();
    SDValue Amt0, Amt1, Amt2, Amt3;
    if (ConstantAmt) {
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
    } else {
      // ISD::SHL is handled above but we include it here for completeness.
      switch (Opc) {
      default:
        llvm_unreachable("Unknown target vector shift node");
      case ISD::SHL:
        Opc = X86ISD::VSHL;
        break;
      case ISD::SRL:
        Opc = X86ISD::VSRL;
        break;
      case ISD::SRA:
        Opc = X86ISD::VSRA;
        break;
      }
      // The SSE2 shifts use the lower i64 as the same shift amount for
      // all lanes and the upper i64 is ignored. These shuffle masks
      // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
      SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
    }

    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
    SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
    SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
    SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
    SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
    SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
    return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
  }
  if (VT == MVT::v16i8 ||
      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
    unsigned ShiftOpcode = Op->getOpcode();

    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
      // On SSE41 targets we make use of the fact that VSELECT lowers
      // to PBLENDVB which selects bytes based just on the sign bit.
      if (Subtarget.hasSSE41()) {
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        return DAG.getBitcast(SelVT,
                              DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
      // zero - a negative value will set all bits of the lanes to true
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
      return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
    };
    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
    // We can safely do this using i16 shifts as we're only interested in
    // the 3 lower bits of each byte.
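    // Shifting left by 5 places bit 2 of the 3-bit shift count in the byte's
    // sign bit, which is all PBLENDVB/PCMPGT inspect; each "a += a" below
    // then moves the next lower bit into that position. The three selects
    // therefore apply the partial shifts of 4, 2 and 1 exactly when bits 2,
    // 1 and 0 of the original amount are set.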
    Amt = DAG.getBitcast(ExtVT, Amt);
    Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
    Amt = DAG.getBitcast(VT, Amt);

    if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
      // r = VSELECT(r, shift(r, 4), a);
      SDValue M =
          DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // r = VSELECT(r, shift(r, 2), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // return VSELECT(r, shift(r, 1), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);
      return R;
    }
    if (Op->getOpcode() == ISD::SRA) {
      // For SRA we need to unpack each byte to the higher byte of an i16
      // vector so we can correctly sign extend. We don't care what happens
      // to the lower byte.
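      // E.g. UNPCKL with an undef low operand interleaves to
      // <undef, b0, undef, b1, ...>, so each i16 lane holds its byte in the
      // high half and an i16 arithmetic shift sees the byte's sign bit as
      // the lane's sign bit.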
      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
      ALo = DAG.getBitcast(ExtVT, ALo);
      AHi = DAG.getBitcast(ExtVT, AHi);
      RLo = DAG.getBitcast(ExtVT, RLo);
      RHi = DAG.getBitcast(ExtVT, RHi);

      // r = VSELECT(r, shift(r, 4), a);
      SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                                DAG.getConstant(4, dl, ExtVT));
      SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                                DAG.getConstant(4, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 2), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(2, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(2, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 1), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(1, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(1, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // Logical shift the result back to the lower byte, leaving a zero
      // upper byte, meaning that we can safely pack with PACKUSWB.
      RLo =
          DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
      RHi =
          DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
    }
  }
  // It's worth extending once and using the v8i32 shifts for 16-bit types, but
  // the extra overheads to get from v16i8 to v8i32 make the existing SSE
  // solution better.
  if (Subtarget.hasInt256() && VT == MVT::v8i16) {
    MVT ExtVT = MVT::v8i32;
    unsigned ExtOpc =
        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
  }
  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
    MVT ExtVT = MVT::v8i32;
    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
    ALo = DAG.getBitcast(ExtVT, ALo);
    AHi = DAG.getBitcast(ExtVT, AHi);
    RLo = DAG.getBitcast(ExtVT, RLo);
    RHi = DAG.getBitcast(ExtVT, RHi);
    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
    Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
    Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }
  if (VT == MVT::v8i16) {
    unsigned ShiftOpcode = Op->getOpcode();

    // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
    bool UseSSE41 = Subtarget.hasSSE41() &&
                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
      // On SSE41 targets we make use of the fact that VSELECT lowers
      // to PBLENDVB which selects bytes based just on the sign bit.
      if (UseSSE41) {
        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
        V0 = DAG.getBitcast(ExtVT, V0);
        V1 = DAG.getBitcast(ExtVT, V1);
        Sel = DAG.getBitcast(ExtVT, Sel);
        return DAG.getBitcast(
            VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we splat the sign bit - a negative value will
      // set all bits of the lanes to true and VSELECT uses that in
      // its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue C =
          DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
      return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
    if (UseSSE41) {
      // On SSE41 targets we need to replicate the shift mask in both
      // bytes for PBLENDVB.
      Amt = DAG.getNode(
          ISD::OR, dl, VT,
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
    } else {
      Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
    }

    // r = VSELECT(r, shift(r, 8), a);
    SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 4), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 2), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // return VSELECT(r, shift(r, 1), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
    R = SignBitSelect(Amt, M, R);
    return R;
  }
  // Decompose 256-bit shifts into smaller 128-bit shifts.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  return SDValue();
}
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  assert(VT.isVector() && "Custom lowering only for vector rotates!");
  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
  assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");

  // XOP has 128-bit vector variable + immediate rotates.
  // +ve/-ve Amt = rotate left/right.

  // Split 256-bit integers.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

  // Attempt to rotate by immediate.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
      assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
      return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
                         DAG.getConstant(RotateAmt, DL, MVT::i8));
    }
  }

  // Use general rotate by variable (per-element).
  return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular
  // instruction plus a "setcc" instruction that checks the overflow flag.
  // The "brcond" lowering looks for this combo and may remove the "setcc"
  // instruction if the "setcc" has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  X86::CondCode Cond;
  SDLoc DL(Op);
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::INC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::DEC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    if (N->getValueType(0) == MVT::i8) {
      BaseOp = X86ISD::UMUL8;
      Cond = X86::COND_O;
      break;
    }
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);

    SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);

    if (N->getValueType(1) == MVT::i1)
      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
  }
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

  SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);

  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
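/// E.g. on a 32-bit target a 64-bit atomicrmw needs cmpxchg8b, and on a
/// 64-bit target a 128-bit atomicrmw needs cmpxchg16b.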
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
  unsigned OpWidth = MemType->getPrimitiveSizeInBits();

  if (OpWidth == 64)
    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
  else if (OpWidth == 128)
    return Subtarget.hasCmpxchg16b();

  return false;
}
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return needsCmpXchgNb(SI->getValueOperand()->getType());
}

// Note: this turns large loads into lock cmpxchg8b/16b.
// FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
  return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
                                               : AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();

  // If the operand is too big, we must see if cmpxchg8/16b is available
  // and default to library calls otherwise.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
    return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                   : AtomicExpansionKind::None;
  }

  AtomicRMWInst::BinOp Op = AI->getOperation();
  switch (Op) {
  default:
    llvm_unreachable("Unknown atomic operation");
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return AtomicExpansionKind::None;
  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // If the atomicrmw's result isn't actually used, we can just add a "lock"
    // prefix to a normal instruction for these operations.
    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
                            : AtomicExpansionKind::None;
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // These always require a non-trivial set of data operations on x86.
    // We must use a cmpxchg loop.
    return AtomicExpansionKind::CmpXChg;
  }
}
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();
  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
  // there is no benefit in turning such RMWs into loads, and it is actually
  // harmful as it introduces an mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  auto Builder = IRBuilder<>(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SynchScope = AI->getSynchScope();
  // We must restrict the ordering to avoid generating loads with Release or
  // ReleaseAcquire orderings.
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
  auto Ptr = AI->getPointerOperand();

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 1:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 2:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. An mfence flushes the store
  // buffer, making the optimization clearly correct.
  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
  // otherwise, we might be able to be more aggressive on relaxed idempotent
  // rmw. In practice, they do not look useful, so we don't try to be
  // especially clever.
  if (SynchScope == SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load.
  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
          AI->getType()->getPrimitiveSizeInBits());
  Loaded->setAtomic(Order, SynchScope);
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence.
  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
      FenceScope == CrossThread) {
    if (Subtarget.hasMFence())
      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32),     // Base
      DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
      DAG.getRegister(0, MVT::i32),            // Index
      DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
      DAG.getRegister(0, MVT::i32),            // Segment.
      Zero,
      Chain
    };
    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
    return SDValue(Res, 0);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.SimpleTy) {
  default: llvm_unreachable("Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, DL, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, T, MMO);

  SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, cpOut.getValue(2));
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
  return SDValue();
}
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
      SrcVT == MVT::i64) {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();

    SDValue Op0 = Op->getOperand(0);
    SmallVector<SDValue, 16> Elts;
    SDLoc dl(Op);
    unsigned NumElts;
    MVT SVT;
    if (SrcVT.isVector()) {
      NumElts = SrcVT.getVectorNumElements();
      SVT = SrcVT.getVectorElementType();

      // Widen the vector in input in the case of MVT::v2i32.
      // Example: from MVT::v2i32 to MVT::v4i32.
      for (unsigned i = 0, e = NumElts; i != e; ++i)
        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
                                   DAG.getIntPtrConstant(i, dl)));
    } else {
      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
             "Unexpected source type in LowerBITCAST");
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(0, dl)));
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(1, dl)));
      NumElts = 2;
      SVT = MVT::i32;
    }
    // Explicitly mark the extra elements as Undef.
    Elts.append(NumElts, DAG.getUNDEF(SVT));

    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
                       DAG.getIntPtrConstant(0, dl));
  }

  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
         Subtarget.hasMMX() && "Unexpected custom BITCAST");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits() == 64)) &&
         "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT == MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT == MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}
/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
  MVT ByteVecVT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
         "Expected value to have byte element type.");
  assert(EltVT != MVT::i8 &&
         "Horizontal byte sum only makes sense for wider elements!");
  unsigned VecSize = VT.getSizeInBits();
  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

  // The PSADBW instruction horizontally adds all bytes and leaves the result
  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
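  // E.g. psadbw against a zero vector computes, per i64 lane,
  // |b0 - 0| + ... + |b7 - 0|, i.e. the plain sum of the lane's eight bytes.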
  if (EltVT == MVT::i64) {
    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
    return DAG.getBitcast(VT, V);
  }
  if (EltVT == MVT::i32) {
    // We unpack the low half and high half into i32s interleaved with zeros so
    // that we can use PSADBW to horizontally sum them. The most useful part of
    // this is that it lines up the results of two PSADBW instructions to be
    // two v2i64 vectors which concatenated are the 4 population counts. We can
    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
    SDValue V32 = DAG.getBitcast(VT, V);
    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);

    // Do the horizontal sums into two v2i64s.
    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                      DAG.getBitcast(ByteVecVT, Low), Zeros);
    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                       DAG.getBitcast(ByteVecVT, High), Zeros);

    // Merge them together.
    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
                    DAG.getBitcast(ShortVecVT, Low),
                    DAG.getBitcast(ShortVecVT, High));

    return DAG.getBitcast(VT, V);
  }
  // The only element type left is i16.
  assert(EltVT == MVT::i16 && "Unknown how to handle type");

  // To obtain the pop count for each i16 element starting from the pop count
  // for i8 elements, shift the i16s left by 8, sum as i8s, and then shift as
  // i16s right by 8. It is important to shift as i16s as an i8 vector shift
  // isn't directly supported.
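  // E.g. for an i16 lane whose two byte counts are <hi, lo>: after the i16
  // shift left by 8 the lane is <lo, 0>; the byte-wise add yields
  // <hi + lo, lo>; and the i16 logical shift right by 8 leaves <0, hi + lo>,
  // which is the lane's full pop count.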
  SDValue ShifterV = DAG.getConstant(8, DL, VT);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
                  DAG.getBitcast(ByteVecVT, V));
  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned VecSize = VT.getSizeInBits();

  // Implement a lookup table in register by using an algorithm based on:
  // http://wm.ite.pl/articles/sse-popcount.html
  //
  // The general idea is that every lower byte nibble in the input vector is an
  // index into an in-register pre-computed pop count table. We then split up
  // the input vector into two new ones: (1) a vector with only the
  // shifted-right higher nibbles for each byte and (2) a vector with the lower
  // nibbles (and masked out higher ones) for each byte. PSHUFB is used
  // separately with both to index the in-register table. Next, both are added
  // and the result is an i8 vector where each element contains the pop count
  // for its input byte.
  //
  // To obtain the pop count for elements != i8, we follow up with the same
  // approach and use additional tricks as described below.
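  //
  // Scalar equivalent of the per-byte step (illustrative):
  //   popcnt8(x) = LUT[x >> 4] + LUT[x & 0xF]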
  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};

  int NumByteElts = VecSize / 8;
  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
  SDValue In = DAG.getBitcast(ByteVecVT, Op);
  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumByteElts; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);

  // High nibbles
  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);

  // Low nibbles
  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);

  // The input vector is used as the shuffle mask that indexes elements into
  // the LUT. After counting low and high nibbles, add the vectors to obtain
  // the final pop count per i8 element.
  SDValue HighPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
  SDValue LowPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);

  if (EltVT == MVT::i8)
    return PopCnt;

  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
}
static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitmath lowering supported.");

  int VecSize = VT.getSizeInBits();
  MVT EltVT = VT.getVectorElementType();
  int Len = EltVT.getSizeInBits();

  // This is the vectorized version of the "best" algorithm from
  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  // with a minor tweak to use a series of adds + shifts instead of vector
  // multiplications. Implemented for all integer vector types. We only use
  // this when we don't have SSSE3 which allows a LUT-based lowering that is
  // much faster, even faster than using native popcnt instructions.
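  //
  // The scalar form of the three steps below for a single byte b
  // (illustrative):
  //   b = b - ((b >> 1) & 0x55);          // 2-bit field sums
  //   b = (b & 0x33) + ((b >> 2) & 0x33); // 4-bit field sums
  //   b = (b + (b >> 4)) & 0x0F;          // full byte pop count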
  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
    MVT VT = V.getSimpleValueType();
    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
  };

  auto GetMask = [&](SDValue V, APInt Mask) {
    MVT VT = V.getSimpleValueType();
    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
  };

  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
  // x86, so set the SRL type to have elements at least i16 wide. This is
  // correct because all of our SRLs are followed immediately by a mask anyways
  // that handles any bits that sneak into the high bits of the byte elements.
  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);

  SDValue V = Op;

  // v = v - ((v >> 1) & 0x55555555...)
  SDValue Srl =
      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
  V = DAG.getNode(ISD::SUB, DL, VT, V, And);

  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);

  // v = (v + (v >> 4)) & 0x0F0F0F0F...
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));

  // At this point, V contains the byte-wise population count, and we are
  // merely doing a horizontal sum if necessary to get the wider element
  // counts.
  if (EltVT == MVT::i8)
    return V;

  return LowerHorizontalByteSum(
      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
      DAG);
}
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
         "Unknown CTPOP type to handle");
  SDLoc DL(Op.getNode());
  SDValue Op0 = Op.getOperand(0);

  if (!Subtarget.hasSSSE3()) {
    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
  }

  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
    unsigned NumElems = VT.getVectorNumElements();

    // Extract each 128-bit vector, compute pop count and concat the result.
    SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
    SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);

    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
                       LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
                       LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
  }

  if (VT.is512BitVector() && !Subtarget.hasBWI()) {
    unsigned NumElems = VT.getVectorNumElements();

    // Extract each 256-bit vector, compute pop count and concat the result.
    SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
    SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);

    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
                       LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
                       LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
  }

  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}

static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().isVector() &&
         "We only do custom lowering for vector population count.");
  return LowerVectorCTPOP(Op, Subtarget, DAG);
}
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
  if (!VT.isVector()) {
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }

  MVT SVT = VT.getVectorElementType();
  int NumElts = VT.getVectorNumElements();
  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector()) {
    SDValue Lo = extract128BitVector(In, 0, DAG, DL);
    SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);

    MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
                       DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
                       DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
  }

  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitreverse lowering supported.");

  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
  // perform the BSWAP in the shuffle.
  // It's best to shuffle using the second operand as this will implicitly
  // allow memory folding for multiple vectors.
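  // Per the XOP VPPERM encoding, each control byte is
  // (operation << 5) | source_byte_index, where operation 2 reverses the bits
  // of the selected byte and indices 16-31 pick bytes from the second source.
  // E.g. for element 0 of a v4i32 input the loop below emits
  // 0x53, 0x52, 0x51, 0x50 - bytes 19..16 bit-reversed, which also swaps the
  // byte order.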
  SmallVector<SDValue, 16> MaskElts;
  for (int i = 0; i != NumElts; ++i) {
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
      int PermuteByte = SourceByte | (2 << 5);
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
    }
  }

  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
                    Res, Mask);
  return DAG.getBitcast(VT, Res);
}
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  if (Subtarget.hasXOP())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarType() == MVT::i8 &&
         "Only byte vector BITREVERSE supported");

  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
    MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
    SDValue Lo = extract128BitVector(In, 0, DAG, DL);
    SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
    Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
    Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
  }

  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles and a PSHUFB lookup to find the bitreverse of each
  // 0-15 value (moved to the other nibble).
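  // E.g. bitreverse(0x2C) = bitreverse(0b00101100): LoLUT[0xC] = 0x30
  // reverses the low nibble into the high nibble, HiLUT[0x2] = 0x04 reverses
  // the high nibble into the low nibble, and OR'ing the two gives 0x34
  // (0b00110100).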
  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

  const int LoLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
  const int HiLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
  for (unsigned i = 0; i < NumElts; ++i) {
    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
  }

  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
  unsigned NewOpc = 0;
  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD_ADD:
    NewOpc = X86ISD::LADD;
    break;
  case ISD::ATOMIC_LOAD_SUB:
    NewOpc = X86ISD::LSUB;
    break;
  case ISD::ATOMIC_LOAD_OR:
    NewOpc = X86ISD::LOR;
    break;
  case ISD::ATOMIC_LOAD_XOR:
    NewOpc = X86ISD::LXOR;
    break;
  case ISD::ATOMIC_LOAD_AND:
    NewOpc = X86ISD::LAND;
    break;
  default:
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
  }

  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
  return DAG.getMemIntrinsicNode(
      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
      /*MemVT=*/N->getSimpleValueType(0), MMO);
}
/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  unsigned Opc = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
  // can only be lowered when the result is unused. They should have already
  // been transformed into a cmpxchg loop in AtomicExpand.
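  // E.g. "lock xadd" implements atomic fetch-add when the old value is
  // needed, while a plain "lock add" suffices when only the memory effect
  // (and EFLAGS) matters.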
  if (N->hasAnyUseOfValue(0)) {
    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
    // select LXADD if LOCK_SUB can't be selected.
    if (Opc == ISD::ATOMIC_LOAD_SUB) {
      AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
                           RHS, AN->getMemOperand());
    }
    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
           "Used AtomicRMW ops other than Add should have been expanded!");
    return N;
  }

  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
  // RAUW the chain, but don't worry about the result, as it's unused.
  assert(!N->hasAnyUseOfValue(0));
  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
  return SDValue();
}
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();

  // Convert seq_cst store -> xchg
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
  // FIXME: On 32-bit, store -> fist or movq would be more efficient
  //        (The only way to get a 16-byte store is cmpxchg16b)
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
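  // E.g. "x.store(v, seq_cst)" becomes an implicitly-locked "xchg" whose
  // result is simply discarded, rather than "mov" followed by "mfence".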
  if (cast<AtomicSDNode>(Node)->getOrdering() ==
          AtomicOrdering::SequentiallyConsistent ||
      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
                                 Node->getOperand(0),
                                 Node->getOperand(1), Node->getOperand(2),
                                 cast<AtomicSDNode>(Node)->getMemOperand());
    return Swap.getValue(1);
  }
  // Other atomic stores have a simple pattern.
  return Op;
}
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getNode()->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  unsigned Opc;
  bool ExtraOp = false;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid code");
  case ISD::ADDC: Opc = X86ISD::ADD; break;
  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
  case ISD::SUBC: Opc = X86ISD::SUB; break;
  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
  }

  if (!ExtraOp)
    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                       Op.getOperand(1));
  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                     Op.getOperand(1), Op.getOperand(2));
}
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
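  // Illustrative C-level prototypes for the entry points assumed here (not
  // declarations taken from any SDK header):
  //   struct { float sinval; float cosval; } __sincosf_stret(float);
  //   struct { double sinval; double cosval; } __sincos_stret(double);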
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.isSExt = false;
  Entry.isZExt = false;
  Args.push_back(Entry);

  bool isF64 = ArgVT == MVT::f64;
  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

  Type *RetTy = isF64
      ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
      : (Type*)VectorType::get(ArgTy, 4);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;

  // Returned in bits 0:31 and 32:63 of xmm0.
  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(0, dl));
  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(1, dl));
  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
/// Widen a vector input to a vector of NVT. The input vector must have the
/// same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
                            bool FillWithZeroes = false) {
  // Check if InOp already has the right width.
  MVT InVT = InOp.getSimpleValueType();
  if (InVT == NVT)
    return InOp;

  if (InOp.isUndef())
    return DAG.getUNDEF(NVT);

  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
         "input and widen element type must match");

  unsigned InNumElts = InVT.getVectorNumElements();
  unsigned WidenNumElts = NVT.getVectorNumElements();
  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
         "Unexpected request for vector widening");

  EVT EltVT = NVT.getVectorElementType();
  SDLoc dl(InOp);

  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
      InOp.getNumOperands() == 2) {
    SDValue N1 = InOp.getOperand(1);
    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
        N1.isUndef()) {
      InOp = InOp.getOperand(0);
      InVT = InOp.getSimpleValueType();
      InNumElts = InVT.getVectorNumElements();
    }
  }
  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
    SmallVector<SDValue, 16> Ops;
    for (unsigned i = 0; i < InNumElts; ++i)
      Ops.push_back(InOp.getOperand(i));

    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
        DAG.getUNDEF(EltVT);
    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
      Ops.push_back(FillVal);
    return DAG.getBuildVector(NVT, dl, Ops);
  }
  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
      DAG.getUNDEF(NVT);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
                     InOp, DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  // X86 scatter kills the mask register, so its type should be added to
  // the list of return values.
  // If the "scatter" has 2 return values, it is already handled.
  if (Op.getNode()->getNumValues() == 2)
    return Op;

  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
  SDValue Src = N->getValue();
  MVT VT = Src.getSimpleValueType();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
  SDLoc dl(Op);

  SDValue NewScatter;
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Chain = N->getChain();
  SDValue BasePtr = N->getBasePtr();
  MVT MemVT = N->getMemoryVT().getSimpleVT();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
    // The v2i32 value was promoted to v2i64.
    // Now we "redo" the type legalizer's work and widen the original
    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
    // with a shuffle.
    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
           "Unexpected memory type");
    int ShuffleMask[] = {0, 2, -1, -1};
    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
    // Now we have 4 elements instead of 2.
    // Expand the index.
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
    Index = ExtendToType(Index, NewIndexVT, DAG);

    // Expand the mask with zeroes.
    // The mask may be <2 x i64> or <2 x i1> at this moment.
    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
           "Unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
  }

  unsigned NumElts = VT.getVectorNumElements();
  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors, so either the data or the index
    // must be 512 bits wide. If both the index and data are 256-bit here but
    // the vector has 8 elements, we can simply sign-extend the index.
    if (IndexVT == MVT::v8i32)
      // Just extend the index.
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
    else {
      // The minimal number of elts in scatter is 8.
      NumElts = 8;

      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
      // Use the original index here, do not modify the index twice.
      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
      if (IndexVT.getScalarType() == MVT::i32)
        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

      // At this point we have a promoted mask operand.
      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
      // Use the original mask here, do not modify the mask twice.
      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

      // The value that should be stored.
      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
      Src = ExtendToType(Src, NewVT, DAG);
    }
  }
  // If the mask is "wide" at this point, truncate it to an i1 vector.
  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);

  // The mask is killed by scatter, add it to the values.
  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
  NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
                                    N->getMemOperand());
  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
  return SDValue(NewScatter.getNode(), 1);
}
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {

  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
         "Expanding masked load is supported on AVX-512 target only!");

  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
         "Expanding masked load is supported for 32 and 64-bit types only!");

  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
  // VLX. These types for exp-loads are handled here.
  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked load op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked load op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
  SDValue Src0 = N->getSrc0();
  Src0 = ExtendToType(Src0, WideDataVT, DAG);

  // The mask element has to be i1.
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
                                      N->getBasePtr(), Mask, Src0,
                                      N->getMemoryVT(), N->getMemOperand(),
                                      N->getExtensionType(),
                                      N->isExpandingLoad());

  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
                                DAG.getIntPtrConstant(0, dl));
  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
  return DAG.getMergeValues(RetOps, dl);
}
22807 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
22808 SelectionDAG &DAG) {
22809 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
22810 SDValue DataToStore = N->getValue();
22811 MVT VT = DataToStore.getSimpleValueType();
22812 MVT ScalarVT = VT.getScalarType();
22813 SDValue Mask = N->getMask();
22816 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
22817 "Compressing masked store is supported on AVX-512 targets only!");
22819 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
22820 "Compressing masked store is supported for 32 and 64-bit types only!");
22822 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
22823 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
22824 return Op;
22826 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
22827 "Cannot lower masked store op.");
22829 assert((ScalarVT.getSizeInBits() >= 32 ||
22830 (Subtarget.hasBWI() &&
22831 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
22832 "Unsupported masked store op.");
22834 // This operation is legal for targets with VLX, but without
22835 // VLX the vector should be widened to 512 bits.
22836 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
22837 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
22839 // Mask element has to be i1.
22840 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
22841 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
22842 "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
22844 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
22846 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
22847 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
22848 if (MaskEltTy != MVT::i1)
22849 Mask = DAG.getNode(ISD::TRUNCATE, dl,
22850 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
22851 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
22852 Mask, N->getMemoryVT(), N->getMemOperand(),
22853 N->isTruncatingStore(), N->isCompressingStore());
22854 }
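// Illustrative counterpart to the masked-load widening above: a v8i32 masked
// store is widened to v16i32 with a zero-extended v16i1 mask, so the lanes
// added by the widening are never written to memory.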
22856 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
22857 SelectionDAG &DAG) {
22858 assert(Subtarget.hasAVX512() &&
22859 "MGATHER/MSCATTER are supported on AVX-512 arch only");
22861 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
22863 MVT VT = Op.getSimpleValueType();
22864 SDValue Index = N->getIndex();
22865 SDValue Mask = N->getMask();
22866 SDValue Src0 = N->getValue();
22867 MVT IndexVT = Index.getSimpleValueType();
22868 MVT MaskVT = Mask.getSimpleValueType();
22870 unsigned NumElts = VT.getVectorNumElements();
22871 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
22873 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
22874 !Index.getSimpleValueType().is512BitVector()) {
22875 // AVX512F supports only 512-bit vectors, so either the data or the index
22876 // must be 512 bits wide. If both index and data are currently 256-bit but
22877 // the vector contains 8 elements, we just sign-extend the index.
22878 if (NumElts == 8) {
22879 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
22880 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
22881 N->getOperand(3), Index };
22882 DAG.UpdateNodeOperands(N, Ops);
22883 return Op;
22884 }
22886 // Minimal number of elements in Gather
22887 NumElts = 8;
22888 // Index
22889 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
22890 Index = ExtendToType(Index, NewIndexVT, DAG);
22891 if (IndexVT.getScalarType() == MVT::i32)
22892 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
22895 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
22896 // At this point the mask operand has been promoted.
22897 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
22898 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
22899 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
22900 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
22902 // The pass-thru value
22903 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
22904 Src0 = ExtendToType(Src0, NewVT, DAG);
22906 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
22907 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
22908 N->getMemoryVT(), dl, Ops,
22909 N->getMemOperand());
22910 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
22911 NewGather.getValue(0),
22912 DAG.getIntPtrConstant(0, dl));
22913 SDValue RetOps[] = {Extract, NewGather.getValue(1)};
22914 return DAG.getMergeValues(RetOps, dl);
22915 }
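// Example (illustrative): a v8i32 gather with a v8i32 index on AVX512F
// without VLX takes the first branch above; only the index is sign-extended
// to v8i64, which makes the operation 512 bits wide without changing the
// data type.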
22919 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
22920 SelectionDAG &DAG) const {
22921 // TODO: Eventually, the lowering of these nodes should be informed by or
22922 // deferred to the GC strategy for the function in which they appear. For
22923 // now, however, they must be lowered to something. Since they are logically
22924 // no-ops in the case of a null GC strategy (or a GC strategy which does not
22925 // require special handling for these nodes), lower them as literal NOOPs for
22926 // the time being.
22927 SmallVector<SDValue, 2> Ops;
22929 Ops.push_back(Op.getOperand(0));
22930 if (Op->getGluedNode())
22931 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
22934 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
22935 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
22936 return NOOP;
22937 }
22940 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
22941 SelectionDAG &DAG) const {
22942 // TODO: Eventually, the lowering of these nodes should be informed by or
22943 // deferred to the GC strategy for the function in which they appear. For
22944 // now, however, they must be lowered to something. Since they are logically
22945 // no-ops in the case of a null GC strategy (or a GC strategy which does not
22945 // require special handling for these nodes), lower them as literal NOOPs for
22946 // the time being.
22948 SmallVector<SDValue, 2> Ops;
22950 Ops.push_back(Op.getOperand(0));
22951 if (Op->getGluedNode())
22952 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
22955 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
22956 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
22957 return NOOP;
22958 }
22961 /// Provide custom lowering hooks for some operations.
22962 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
22963 switch (Op.getOpcode()) {
22964 default: llvm_unreachable("Should not custom lower this!");
22965 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
22966 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
22967 return LowerCMP_SWAP(Op, Subtarget, DAG);
22968 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
22969 case ISD::ATOMIC_LOAD_ADD:
22970 case ISD::ATOMIC_LOAD_SUB:
22971 case ISD::ATOMIC_LOAD_OR:
22972 case ISD::ATOMIC_LOAD_XOR:
22973 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
22974 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
22975 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
22976 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
22977 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
22978 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
22979 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
22980 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
22981 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
22982 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
22983 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
22984 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
22985 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
22986 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
22987 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
22988 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
22989 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
22990 case ISD::SHL_PARTS:
22991 case ISD::SRA_PARTS:
22992 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
22993 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
22994 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
22995 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
22996 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
22997 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
22998 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
22999 case ISD::ZERO_EXTEND_VECTOR_INREG:
23000 case ISD::SIGN_EXTEND_VECTOR_INREG:
23001 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
23002 case ISD::FP_TO_SINT:
23003 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG);
23004 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
23005 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
23006 case ISD::FABS:
23007 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
23008 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
23009 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
23010 case ISD::SETCC: return LowerSETCC(Op, DAG);
23011 case ISD::SETCCE: return LowerSETCCE(Op, DAG);
23012 case ISD::SELECT: return LowerSELECT(Op, DAG);
23013 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
23014 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
23015 case ISD::VASTART: return LowerVASTART(Op, DAG);
23016 case ISD::VAARG: return LowerVAARG(Op, DAG);
23017 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
23018 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
23019 case ISD::INTRINSIC_VOID:
23020 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
23021 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
23022 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
23023 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
23024 case ISD::FRAME_TO_ARGS_OFFSET:
23025 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
23026 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
23027 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
23028 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
23029 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
23030 case ISD::EH_SJLJ_SETUP_DISPATCH:
23031 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
23032 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
23033 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
23034 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
23035 case ISD::CTLZ:
23036 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
23037 case ISD::CTTZ:
23038 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
23039 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
23040 case ISD::MULHS:
23041 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
23042 case ISD::UMUL_LOHI:
23043 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
23044 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
23045 case ISD::SRA:
23046 case ISD::SRL:
23047 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
23048 case ISD::SADDO:
23049 case ISD::UADDO:
23050 case ISD::SSUBO:
23051 case ISD::USUBO:
23052 case ISD::SMULO:
23053 case ISD::UMULO: return LowerXALUO(Op, DAG);
23054 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
23055 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
23056 case ISD::ADDC:
23057 case ISD::ADDE:
23058 case ISD::SUBC:
23059 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
23060 case ISD::ADD: return LowerADD(Op, DAG);
23061 case ISD::SUB: return LowerSUB(Op, DAG);
23062 case ISD::SMAX:
23063 case ISD::SMIN:
23064 case ISD::UMAX:
23065 case ISD::UMIN: return LowerMINMAX(Op, DAG);
23066 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
23067 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
23068 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
23069 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
23070 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
23071 case ISD::GC_TRANSITION_START:
23072 return LowerGC_TRANSITION_START(Op, DAG);
23073 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
23074 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
23075 }
23076 }
23078 /// Places new result values for the node in Results (their number
23079 /// and types must exactly match those of the original return values of
23080 /// the node), or leaves Results empty, which indicates that the node is not
23081 /// to be custom lowered after all.
23082 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
23083 SmallVectorImpl<SDValue> &Results,
23084 SelectionDAG &DAG) const {
23085 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
23087 if (!Res.getNode())
23088 return;
23090 assert((N->getNumValues() <= Res->getNumValues()) &&
23091 "Lowering returned the wrong number of results!");
23093 // Place the new result values based on N's result numbering.
23094 // In some cases (LowerSINT_TO_FP for example) Res has more result values
23095 // than the original node; the extra chain result (the last value) is dropped.
23096 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
23097 Results.push_back(Res.getValue(I));
23098 }
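// Illustrative example (not from the original source): if N has results
// (v2i32, ch) and the custom lowering produced (v2i32, ch, ch'), the loop
// above copies only the first two values into Results and the extra chain
// is silently dropped.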
23100 /// Replace a node with an illegal result type with a new node built out of
23101 /// custom code.
23102 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
23103 SmallVectorImpl<SDValue>&Results,
23104 SelectionDAG &DAG) const {
23105 SDLoc dl(N);
23106 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23107 switch (N->getOpcode()) {
23108 default:
23109 llvm_unreachable("Do not know how to custom type legalize this operation!");
23110 case X86ISD::AVG: {
23111 // Legalize types for X86ISD::AVG by expanding vectors.
23112 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23114 auto InVT = N->getValueType(0);
23115 auto InVTSize = InVT.getSizeInBits();
23116 const unsigned RegSize =
23117 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
23118 assert((Subtarget.hasBWI() || RegSize < 512) &&
23119 "512-bit vector requires AVX512BW");
23120 assert((Subtarget.hasAVX2() || RegSize < 256) &&
23121 "256-bit vector requires AVX2");
23123 auto ElemVT = InVT.getVectorElementType();
23124 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
23125 RegSize / ElemVT.getSizeInBits());
23126 assert(RegSize % InVT.getSizeInBits() == 0);
23127 unsigned NumConcat = RegSize / InVT.getSizeInBits();
23129 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
23130 Ops[0] = N->getOperand(0);
23131 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23132 Ops[0] = N->getOperand(1);
23133 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23135 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
23136 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
23137 DAG.getIntPtrConstant(0, dl)));
23138 return;
23139 }
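// Example of the widening above (illustrative): a v8i8 AVG (64 bits) is
// concatenated with one undef v8i8 into a legal v16i8, the AVG is computed
// on v16i8, and the low v8i8 subvector is extracted as the result.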
23140 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
23141 case X86ISD::FMINC:
23142 case X86ISD::FMIN:
23143 case X86ISD::FMAXC:
23144 case X86ISD::FMAX: {
23145 EVT VT = N->getValueType(0);
23146 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
23147 SDValue UNDEF = DAG.getUNDEF(VT);
23148 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23149 N->getOperand(0), UNDEF);
23150 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23151 N->getOperand(1), UNDEF);
23152 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
23153 return;
23154 }
23155 case ISD::SDIV:
23156 case ISD::UDIV:
23157 case ISD::SREM:
23158 case ISD::UREM:
23159 case ISD::SDIVREM:
23160 case ISD::UDIVREM: {
23161 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
23162 Results.push_back(V);
23163 return;
23164 }
23165 case ISD::FP_TO_SINT:
23166 case ISD::FP_TO_UINT: {
23167 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
23169 if (N->getValueType(0) == MVT::v2i32) {
23170 assert((IsSigned || Subtarget.hasAVX512()) &&
23171 "Can only handle signed conversion without AVX512");
23172 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23173 SDValue Src = N->getOperand(0);
23174 if (Src.getValueType() == MVT::v2f64) {
23175 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23176 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
23177 : X86ISD::CVTTP2UI,
23178 dl, MVT::v4i32, Src);
23179 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23180 Results.push_back(Res);
23181 return;
23182 }
23183 if (Src.getValueType() == MVT::v2f32) {
23184 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23185 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23186 DAG.getUNDEF(MVT::v2f32));
23187 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
23188 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
23189 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23190 Results.push_back(Res);
23191 return;
23192 }
23194 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
23195 // so early out here.
23196 return;
23197 }
23199 std::pair<SDValue,SDValue> Vals =
23200 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
23201 SDValue FIST = Vals.first, StackSlot = Vals.second;
23202 if (FIST.getNode()) {
23203 EVT VT = N->getValueType(0);
23204 // Return a load from the stack slot.
23205 if (StackSlot.getNode())
23206 Results.push_back(
23207 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
23208 else
23209 Results.push_back(FIST);
23210 }
23211 return;
23212 }
23213 case ISD::SINT_TO_FP: {
23214 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
23215 SDValue Src = N->getOperand(0);
23216 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
23217 return;
23218 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
23219 return;
23220 }
23221 case ISD::UINT_TO_FP: {
23222 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23223 EVT VT = N->getValueType(0);
23224 if (VT != MVT::v2f32)
23225 return;
23226 SDValue Src = N->getOperand(0);
23227 EVT SrcVT = Src.getValueType();
23228 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
23229 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
23230 return;
23231 }
23232 if (SrcVT != MVT::v2i32)
23233 return;
23234 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
23235 SDValue VBias =
23236 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
23237 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
23238 DAG.getBitcast(MVT::v2i64, VBias));
23239 Or = DAG.getBitcast(MVT::v2f64, Or);
23240 // TODO: Are there any fast-math-flags to propagate here?
23241 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
23242 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
23243 return;
23244 }
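// The magic-constant trick above, worked through (illustrative):
// 0x4330000000000000 is the double 2^52. Zero-extending a 32-bit unsigned x
// to i64 and OR-ing it into that bit pattern produces the double 2^52 + x
// exactly, because x lands in the low mantissa bits. The FSUB of 2^52 then
// recovers x as an exact double (e.g. x = 7 gives 2^52 + 7.0 - 2^52 = 7.0),
// and the final VFPROUND narrows the result to f32.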
23245 case ISD::FP_ROUND: {
23246 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
23247 return;
23248 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
23249 Results.push_back(V);
23250 return;
23251 }
23252 case ISD::FP_EXTEND: {
23253 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
23254 // No other ValueType for FP_EXTEND should reach this point.
23255 assert(N->getValueType(0) == MVT::v2f32 &&
23256 "Do not know how to legalize this Node");
23259 case ISD::INTRINSIC_W_CHAIN: {
23260 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
23261 switch (IntNo) {
23262 default : llvm_unreachable("Do not know how to custom type "
23263 "legalize this intrinsic operation!");
23264 case Intrinsic::x86_rdtsc:
23265 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
23266 Results);
23267 case Intrinsic::x86_rdtscp:
23268 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
23269 Results);
23270 case Intrinsic::x86_rdpmc:
23271 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
23273 case Intrinsic::x86_xgetbv:
23274 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
23275 }
23276 }
23277 case ISD::INTRINSIC_WO_CHAIN: {
23278 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
23279 Results.push_back(V);
23280 return;
23281 }
23282 case ISD::READCYCLECOUNTER: {
23283 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
23284 Results);
23285 }
23286 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
23287 EVT T = N->getValueType(0);
23288 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
23289 bool Regs64bit = T == MVT::i128;
23290 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
23291 SDValue cpInL, cpInH;
23292 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
23293 DAG.getConstant(0, dl, HalfT));
23294 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
23295 DAG.getConstant(1, dl, HalfT));
23296 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
23297 Regs64bit ? X86::RAX : X86::EAX,
23298 cpInL, SDValue());
23299 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
23300 Regs64bit ? X86::RDX : X86::EDX,
23301 cpInH, cpInL.getValue(1));
23302 SDValue swapInL, swapInH;
23303 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
23304 DAG.getConstant(0, dl, HalfT));
23305 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
23306 DAG.getConstant(1, dl, HalfT));
23307 swapInH =
23308 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
23309 swapInH, cpInH.getValue(1));
23309 swapInH, cpInH.getValue(1));
23310 // If the current function needs the base pointer, RBX,
23311 // we shouldn't use cmpxchg directly. The lowering of that
23312 // instruction would clobber RBX, and since RBX is then a
23313 // reserved register, the register allocator would not make
23314 // sure its value is properly saved and restored around this
23315 // live-range.
23316 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
23318 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23319 unsigned BasePtr = TRI->getBaseRegister();
23320 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
23321 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
23322 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
23323 // ISel prefers the LCMPXCHG64 variant.
23324 // If that assert breaks, that means it is not the case anymore,
23325 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
23326 // not just EBX. This is a matter of accepting i64 input for that
23327 // pseudo, and restoring into the register of the right width
23328 // in the expand pseudo. Everything else should just work.
23329 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
23330 "Saving only half of the RBX");
23331 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
23332 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
23333 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
23334 Regs64bit ? X86::RBX : X86::EBX,
23335 HalfT, swapInH.getValue(1));
23336 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
23337 RBXSave,
23338 /*Glue*/ RBXSave.getValue(2)};
23339 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
23340 } else {
23341 unsigned Opcode =
23342 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
23343 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
23344 Regs64bit ? X86::RBX : X86::EBX, swapInL,
23345 swapInH.getValue(1));
23346 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
23347 swapInL.getValue(1)};
23348 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
23349 }
23350 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
23351 Regs64bit ? X86::RAX : X86::EAX,
23352 HalfT, Result.getValue(1));
23353 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
23354 Regs64bit ? X86::RDX : X86::EDX,
23355 HalfT, cpOutL.getValue(2));
23356 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
23358 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
23359 MVT::i32, cpOutH.getValue(2));
23360 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
23361 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
23363 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
23364 Results.push_back(Success);
23365 Results.push_back(EFLAGS.getValue(1));
23366 return;
23367 }
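// Register protocol used above (illustrative), 64-bit case:
//   RDX:RAX = expected value, RCX:RBX = desired value
//   cmpxchg16b [mem]   ; on success ZF=1 and memory is updated,
//                      ; on failure ZF=0 and RDX:RAX holds the current value.
// The 32-bit case uses EDX:EAX / ECX:EBX with cmpxchg8b, and the SETCC on
// COND_E above turns ZF into the i1 success result.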
23368 case ISD::ATOMIC_SWAP:
23369 case ISD::ATOMIC_LOAD_ADD:
23370 case ISD::ATOMIC_LOAD_SUB:
23371 case ISD::ATOMIC_LOAD_AND:
23372 case ISD::ATOMIC_LOAD_OR:
23373 case ISD::ATOMIC_LOAD_XOR:
23374 case ISD::ATOMIC_LOAD_NAND:
23375 case ISD::ATOMIC_LOAD_MIN:
23376 case ISD::ATOMIC_LOAD_MAX:
23377 case ISD::ATOMIC_LOAD_UMIN:
23378 case ISD::ATOMIC_LOAD_UMAX:
23379 case ISD::ATOMIC_LOAD: {
23380 // Delegate to generic TypeLegalization. Situations we can really handle
23381 // should have already been dealt with by AtomicExpandPass.cpp.
23382 break;
23383 }
23384 case ISD::BITCAST: {
23385 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23386 EVT DstVT = N->getValueType(0);
23387 EVT SrcVT = N->getOperand(0)->getValueType(0);
23389 if (SrcVT != MVT::f64 ||
23390 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
23391 return;
23393 unsigned NumElts = DstVT.getVectorNumElements();
23394 EVT SVT = DstVT.getVectorElementType();
23395 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23396 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
23397 MVT::v2f64, N->getOperand(0));
23398 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
23400 if (ExperimentalVectorWideningLegalization) {
23401 // If we are legalizing vectors by widening, we already have the desired
23402 // legal vector type, just return it.
23403 Results.push_back(ToVecInt);
23404 return;
23405 }
23407 SmallVector<SDValue, 8> Elts;
23408 for (unsigned i = 0, e = NumElts; i != e; ++i)
23409 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
23410 ToVecInt, DAG.getIntPtrConstant(i, dl)));
23412 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
23413 }
23414 }
23415 }
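// Example of the expansion above (illustrative): an f64 -> v4i16 bitcast is
// rebuilt as a SCALAR_TO_VECTOR to v2f64, a bitcast to v8i16, and then four
// EXTRACT_VECTOR_ELTs feeding a v4i16 build_vector.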
23417 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
23418 switch ((X86ISD::NodeType)Opcode) {
23419 case X86ISD::FIRST_NUMBER: break;
23420 case X86ISD::BSF: return "X86ISD::BSF";
23421 case X86ISD::BSR: return "X86ISD::BSR";
23422 case X86ISD::SHLD: return "X86ISD::SHLD";
23423 case X86ISD::SHRD: return "X86ISD::SHRD";
23424 case X86ISD::FAND: return "X86ISD::FAND";
23425 case X86ISD::FANDN: return "X86ISD::FANDN";
23426 case X86ISD::FOR: return "X86ISD::FOR";
23427 case X86ISD::FXOR: return "X86ISD::FXOR";
23428 case X86ISD::FILD: return "X86ISD::FILD";
23429 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
23430 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
23431 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
23432 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
23433 case X86ISD::FLD: return "X86ISD::FLD";
23434 case X86ISD::FST: return "X86ISD::FST";
23435 case X86ISD::CALL: return "X86ISD::CALL";
23436 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
23437 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
23438 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
23439 case X86ISD::BT: return "X86ISD::BT";
23440 case X86ISD::CMP: return "X86ISD::CMP";
23441 case X86ISD::COMI: return "X86ISD::COMI";
23442 case X86ISD::UCOMI: return "X86ISD::UCOMI";
23443 case X86ISD::CMPM: return "X86ISD::CMPM";
23444 case X86ISD::CMPMU: return "X86ISD::CMPMU";
23445 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
23446 case X86ISD::SETCC: return "X86ISD::SETCC";
23447 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
23448 case X86ISD::FSETCC: return "X86ISD::FSETCC";
23449 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
23450 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
23451 case X86ISD::CMOV: return "X86ISD::CMOV";
23452 case X86ISD::BRCOND: return "X86ISD::BRCOND";
23453 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
23454 case X86ISD::IRET: return "X86ISD::IRET";
23455 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
23456 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
23457 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
23458 case X86ISD::Wrapper: return "X86ISD::Wrapper";
23459 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
23460 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
23461 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
23462 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
23463 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
23464 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
23465 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
23466 case X86ISD::PINSRB: return "X86ISD::PINSRB";
23467 case X86ISD::PINSRW: return "X86ISD::PINSRW";
23468 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
23469 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
23470 case X86ISD::ANDNP: return "X86ISD::ANDNP";
23471 case X86ISD::BLENDI: return "X86ISD::BLENDI";
23472 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
23473 case X86ISD::ADDUS: return "X86ISD::ADDUS";
23474 case X86ISD::SUBUS: return "X86ISD::SUBUS";
23475 case X86ISD::HADD: return "X86ISD::HADD";
23476 case X86ISD::HSUB: return "X86ISD::HSUB";
23477 case X86ISD::FHADD: return "X86ISD::FHADD";
23478 case X86ISD::FHSUB: return "X86ISD::FHSUB";
23479 case X86ISD::ABS: return "X86ISD::ABS";
23480 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
23481 case X86ISD::FMAX: return "X86ISD::FMAX";
23482 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
23483 case X86ISD::FMIN: return "X86ISD::FMIN";
23484 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
23485 case X86ISD::FMAXC: return "X86ISD::FMAXC";
23486 case X86ISD::FMINC: return "X86ISD::FMINC";
23487 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
23488 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
23489 case X86ISD::FRCP: return "X86ISD::FRCP";
23490 case X86ISD::FRCPS: return "X86ISD::FRCPS";
23491 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
23492 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
23493 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
23494 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
23495 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
23496 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
23497 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
23498 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
23499 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
23500 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
23501 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
23502 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
23503 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
23504 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
23505 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
23506 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
23507 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
23508 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
23509 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
23510 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
23511 case X86ISD::LADD: return "X86ISD::LADD";
23512 case X86ISD::LSUB: return "X86ISD::LSUB";
23513 case X86ISD::LOR: return "X86ISD::LOR";
23514 case X86ISD::LXOR: return "X86ISD::LXOR";
23515 case X86ISD::LAND: return "X86ISD::LAND";
23516 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
23517 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
23518 case X86ISD::VZEXT: return "X86ISD::VZEXT";
23519 case X86ISD::VSEXT: return "X86ISD::VSEXT";
23520 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
23521 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
23522 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
23523 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
23524 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
23525 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
23526 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
23527 case X86ISD::VINSERT: return "X86ISD::VINSERT";
23528 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
23529 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
23530 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
23531 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
23532 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
23533 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
23534 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
23535 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
23536 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
23537 case X86ISD::VSHL: return "X86ISD::VSHL";
23538 case X86ISD::VSRL: return "X86ISD::VSRL";
23539 case X86ISD::VSRA: return "X86ISD::VSRA";
23540 case X86ISD::VSHLI: return "X86ISD::VSHLI";
23541 case X86ISD::VSRLI: return "X86ISD::VSRLI";
23542 case X86ISD::VSRAI: return "X86ISD::VSRAI";
23543 case X86ISD::VSRAV: return "X86ISD::VSRAV";
23544 case X86ISD::VROTLI: return "X86ISD::VROTLI";
23545 case X86ISD::VROTRI: return "X86ISD::VROTRI";
23546 case X86ISD::VPPERM: return "X86ISD::VPPERM";
23547 case X86ISD::CMPP: return "X86ISD::CMPP";
23548 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
23549 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
23550 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
23551 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
23552 case X86ISD::ADD: return "X86ISD::ADD";
23553 case X86ISD::SUB: return "X86ISD::SUB";
23554 case X86ISD::ADC: return "X86ISD::ADC";
23555 case X86ISD::SBB: return "X86ISD::SBB";
23556 case X86ISD::SMUL: return "X86ISD::SMUL";
23557 case X86ISD::UMUL: return "X86ISD::UMUL";
23558 case X86ISD::SMUL8: return "X86ISD::SMUL8";
23559 case X86ISD::UMUL8: return "X86ISD::UMUL8";
23560 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
23561 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
23562 case X86ISD::INC: return "X86ISD::INC";
23563 case X86ISD::DEC: return "X86ISD::DEC";
23564 case X86ISD::OR: return "X86ISD::OR";
23565 case X86ISD::XOR: return "X86ISD::XOR";
23566 case X86ISD::AND: return "X86ISD::AND";
23567 case X86ISD::BEXTR: return "X86ISD::BEXTR";
23568 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
23569 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
23570 case X86ISD::PTEST: return "X86ISD::PTEST";
23571 case X86ISD::TESTP: return "X86ISD::TESTP";
23572 case X86ISD::TESTM: return "X86ISD::TESTM";
23573 case X86ISD::TESTNM: return "X86ISD::TESTNM";
23574 case X86ISD::KORTEST: return "X86ISD::KORTEST";
23575 case X86ISD::KTEST: return "X86ISD::KTEST";
23576 case X86ISD::PACKSS: return "X86ISD::PACKSS";
23577 case X86ISD::PACKUS: return "X86ISD::PACKUS";
23578 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
23579 case X86ISD::VALIGN: return "X86ISD::VALIGN";
23580 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
23581 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
23582 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
23583 case X86ISD::SHUFP: return "X86ISD::SHUFP";
23584 case X86ISD::SHUF128: return "X86ISD::SHUF128";
23585 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
23586 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
23587 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
23588 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
23589 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
23590 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
23591 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
23592 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
23593 case X86ISD::MOVSD: return "X86ISD::MOVSD";
23594 case X86ISD::MOVSS: return "X86ISD::MOVSS";
23595 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
23596 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
23597 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
23598 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
23599 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
23600 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
23601 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
23602 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
23603 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
23604 case X86ISD::VPERMV: return "X86ISD::VPERMV";
23605 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
23606 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
23607 case X86ISD::VPERMI: return "X86ISD::VPERMI";
23608 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
23609 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
23610 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
23611 case X86ISD::VRANGE: return "X86ISD::VRANGE";
23612 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
23613 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
23614 case X86ISD::PSADBW: return "X86ISD::PSADBW";
23615 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
23616 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
23617 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
23618 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
23619 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
23620 case X86ISD::MFENCE: return "X86ISD::MFENCE";
23621 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
23622 case X86ISD::SAHF: return "X86ISD::SAHF";
23623 case X86ISD::RDRAND: return "X86ISD::RDRAND";
23624 case X86ISD::RDSEED: return "X86ISD::RDSEED";
23625 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
23626 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
23627 case X86ISD::VPROT: return "X86ISD::VPROT";
23628 case X86ISD::VPROTI: return "X86ISD::VPROTI";
23629 case X86ISD::VPSHA: return "X86ISD::VPSHA";
23630 case X86ISD::VPSHL: return "X86ISD::VPSHL";
23631 case X86ISD::VPCOM: return "X86ISD::VPCOM";
23632 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
23633 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
23634 case X86ISD::FMADD: return "X86ISD::FMADD";
23635 case X86ISD::FMSUB: return "X86ISD::FMSUB";
23636 case X86ISD::FNMADD: return "X86ISD::FNMADD";
23637 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
23638 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
23639 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
23640 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
23641 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
23642 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
23643 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
23644 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
23645 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
23646 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
23647 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
23648 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
23649 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
23650 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
23651 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
23652 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
23653 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
23654 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
23655 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
23656 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
23657 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
23658 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
23659 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
23660 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
23661 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
23662 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
23663 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
23664 case X86ISD::XTEST: return "X86ISD::XTEST";
23665 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
23666 case X86ISD::EXPAND: return "X86ISD::EXPAND";
23667 case X86ISD::SELECT: return "X86ISD::SELECT";
23668 case X86ISD::SELECTS: return "X86ISD::SELECTS";
23669 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
23670 case X86ISD::RCP28: return "X86ISD::RCP28";
23671 case X86ISD::RCP28S: return "X86ISD::RCP28S";
23672 case X86ISD::EXP2: return "X86ISD::EXP2";
23673 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
23674 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
23675 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
23676 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
23677 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
23678 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
23679 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
23680 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
23681 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
23682 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
23683 case X86ISD::SCALEF: return "X86ISD::SCALEF";
23684 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
23685 case X86ISD::ADDS: return "X86ISD::ADDS";
23686 case X86ISD::SUBS: return "X86ISD::SUBS";
23687 case X86ISD::AVG: return "X86ISD::AVG";
23688 case X86ISD::MULHRS: return "X86ISD::MULHRS";
23689 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
23690 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
23691 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
23692 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
23693 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
23694 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
23695 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
23696 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
23697 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
23698 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
23699 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
23700 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
23701 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
23702 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
23703 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
23704 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
23705 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
23706 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
23707 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
23708 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
23709 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
23710 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
23711 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
23712 }
23713 return nullptr;
23714 }
23716 /// Return true if the addressing mode represented by AM is legal for this
23717 /// target, for a load/store of the specified type.
23718 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
23719 const AddrMode &AM, Type *Ty,
23720 unsigned AS) const {
23721 // X86 supports extremely general addressing modes.
23722 CodeModel::Model M = getTargetMachine().getCodeModel();
23724 // X86 allows a sign-extended 32-bit immediate field as a displacement.
23725 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
23726 return false;
23728 if (AM.BaseGV) {
23729 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
23731 // If a reference to this global requires an extra load, we can't fold it.
23732 if (isGlobalStubReference(GVFlags))
23733 return false;
23735 // If BaseGV requires a register for the PIC base, we cannot also have a
23736 // BaseReg specified.
23737 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
23738 return false;
23740 // If lower 4G is not available, then we must use rip-relative addressing.
23741 if ((M != CodeModel::Small || isPositionIndependent()) &&
23742 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
23743 return false;
23745 }
23746 switch (AM.Scale) {
23747 case 0:
23748 case 1:
23749 case 2:
23750 case 4:
23751 case 8:
23752 // These scales always work.
23753 break;
23754 case 3:
23755 case 5:
23756 case 9:
23757 // These scales are formed with basereg+scalereg. Only accept if there is
23758 // no basereg yet.
23759 if (AM.HasBaseReg)
23760 return false;
23761 break;
23762 default: // Other stuff never works.
23763 return false;
23764 }
23766 return true;
23767 }
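// Examples (illustrative): [rbx + 8*rcx + disp32] is accepted outright, and a
// scale of 3 is accepted only when no base register is present, since it is
// materialized as index + 2*index and therefore consumes the base slot.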
23769 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
23770 unsigned Bits = Ty->getScalarSizeInBits();
23772 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
23773 // particularly cheaper than those without.
23774 if (Bits == 8)
23775 return false;
23777 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
23778 // variable shifts just as cheap as scalar ones.
23779 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
23780 return true;
23782 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
23783 // fully general vector.
23784 return false;
23785 }
23787 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
23788 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
23789 return false;
23790 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
23791 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
23792 return NumBits1 > NumBits2;
23793 }
23795 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
23796 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
23797 return false;
23799 if (!isTypeLegal(EVT::getEVT(Ty1)))
23800 return false;
23802 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
23804 // Assuming the caller doesn't have a zeroext or signext return parameter,
23805 // truncation all the way down to i1 is valid.
23806 return true;
23807 }
23809 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
23810 return isInt<32>(Imm);
23811 }
23813 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
23814 // Can also use sub to handle negated immediates.
23815 return isInt<32>(Imm);
23816 }
23818 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
23819 if (!VT1.isInteger() || !VT2.isInteger())
23820 return false;
23821 unsigned NumBits1 = VT1.getSizeInBits();
23822 unsigned NumBits2 = VT2.getSizeInBits();
23823 return NumBits1 > NumBits2;
23824 }
23826 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
23827 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
23828 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
23829 }
23831 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
23832 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
23833 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
23834 }
23836 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
23837 EVT VT1 = Val.getValueType();
23838 if (isZExtFree(VT1, VT2))
23839 return true;
23841 if (Val.getOpcode() != ISD::LOAD)
23842 return false;
23844 if (!VT1.isSimple() || !VT1.isInteger() ||
23845 !VT2.isSimple() || !VT2.isInteger())
23846 return false;
23848 switch (VT1.getSimpleVT().SimpleTy) {
23849 default: break;
23850 case MVT::i8:
23851 case MVT::i16:
23852 case MVT::i32:
23853 // X86 has 8, 16, and 32-bit zero-extending loads.
23854 return true;
23855 }
23857 return false;
23858 }
23860 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
23862 bool
23863 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
23864 if (!Subtarget.hasAnyFMA())
23865 return false;
23867 VT = VT.getScalarType();
23869 if (!VT.isSimple())
23870 return false;
23872 switch (VT.getSimpleVT().SimpleTy) {
23873 case MVT::f32:
23874 case MVT::f64:
23875 return true;
23876 default:
23877 break;
23878 }
23880 return false;
23881 }
23883 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
23884 // i16 instructions are longer (0x66 prefix) and potentially slower.
23885 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
23886 }
23888 /// Targets can use this to indicate that they only support *some*
23889 /// VECTOR_SHUFFLE operations, those with specific masks.
23890 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
23891 /// are assumed to be legal.
23892 bool
23893 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
23894 EVT VT) const {
23895 if (!VT.isSimple())
23896 return false;
23898 // Not for i1 vectors
23899 if (VT.getSimpleVT().getScalarType() == MVT::i1)
23900 return false;
23902 // Very little shuffling can be done for 64-bit vectors right now.
23903 if (VT.getSimpleVT().getSizeInBits() == 64)
23904 return false;
23906 // We only care that the types being shuffled are legal. The lowering can
23907 // handle any possible shuffle mask that results.
23908 return isTypeLegal(VT.getSimpleVT());
23909 }
23911 bool
23912 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
23913 EVT VT) const {
23914 // Just delegate to the generic legality, clear masks aren't special.
23915 return isShuffleMaskLegal(Mask, VT);
23916 }
23918 //===----------------------------------------------------------------------===//
23919 // X86 Scheduler Hooks
23920 //===----------------------------------------------------------------------===//
23922 /// Utility function to emit xbegin specifying the start of an RTM region.
23923 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
23924 const TargetInstrInfo *TII) {
23925 DebugLoc DL = MI.getDebugLoc();
23927 const BasicBlock *BB = MBB->getBasicBlock();
23928 MachineFunction::iterator I = ++MBB->getIterator();
23930 // For the v = xbegin(), we generate
23931 //
23932 // thisMBB:
23933 //  xbegin sinkMBB
23934 //
23935 // mainMBB:
23936 //  eax = -1
23937 //
23938 // sinkMBB:
23939 //  v = eax
23941 MachineBasicBlock *thisMBB = MBB;
23942 MachineFunction *MF = MBB->getParent();
23943 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
23944 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
23945 MF->insert(I, mainMBB);
23946 MF->insert(I, sinkMBB);
23948 // Transfer the remainder of BB and its successor edges to sinkMBB.
23949 sinkMBB->splice(sinkMBB->begin(), MBB,
23950 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23951 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
23953 // thisMBB:
23954 //  xbegin sinkMBB
23955 // # fallthrough to mainMBB
23956 // # abort: jump to sinkMBB
23957 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
23958 thisMBB->addSuccessor(mainMBB);
23959 thisMBB->addSuccessor(sinkMBB);
23961 // mainMBB:
23962 //  eax = -1
23963 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
23964 mainMBB->addSuccessor(sinkMBB);
23966 // sinkMBB:
23967 // EAX is live into the sinkMBB
23968 sinkMBB->addLiveIn(X86::EAX);
23969 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
23970 MI.getOperand(0).getReg())
23971 .addReg(X86::EAX);
23973 MI.eraseFromParent();
23974 return sinkMBB;
23975 }
23977 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
23978 // or XMM0_V32I8 in AVX all of this code can be replaced with that
23979 // in the .td file.
23980 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
23981 const TargetInstrInfo *TII) {
23982 unsigned Opc;
23983 switch (MI.getOpcode()) {
23984 default: llvm_unreachable("illegal opcode!");
23985 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
23986 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
23987 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
23988 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
23989 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
23990 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
23991 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
23992 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
23993 }
23995 DebugLoc dl = MI.getDebugLoc();
23996 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
23998 unsigned NumArgs = MI.getNumOperands();
23999 for (unsigned i = 1; i < NumArgs; ++i) {
24000 MachineOperand &Op = MI.getOperand(i);
24001 if (!(Op.isReg() && Op.isImplicit()))
24002 MIB.addOperand(Op);
24003 }
24004 if (MI.hasOneMemOperand())
24005 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24007 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24008 .addReg(X86::XMM0);
24010 MI.eraseFromParent();
24011 return BB;
24012 }
24014 // FIXME: Custom handling because TableGen doesn't support multiple implicit
24015 // defs in an instruction pattern
24016 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
24017 const TargetInstrInfo *TII) {
24018 unsigned Opc;
24019 switch (MI.getOpcode()) {
24020 default: llvm_unreachable("illegal opcode!");
24021 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
24022 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
24023 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
24024 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
24025 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
24026 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
24027 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
24028 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
24029 }
24031 DebugLoc dl = MI.getDebugLoc();
24032 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24034 unsigned NumArgs = MI.getNumOperands(); // remove the results
24035 for (unsigned i = 1; i < NumArgs; ++i) {
24036 MachineOperand &Op = MI.getOperand(i);
24037 if (!(Op.isReg() && Op.isImplicit()))
24038 MIB.addOperand(Op);
24039 }
24040 if (MI.hasOneMemOperand())
24041 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24043 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24044 .addReg(X86::ECX);
24046 MI.eraseFromParent();
24047 return BB;
24048 }
24050 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24051 const X86Subtarget &Subtarget) {
24052 DebugLoc dl = MI.getDebugLoc();
24053 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24055 // insert input VAL into EAX
24056 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
24057 .addReg(MI.getOperand(0).getReg());
24058 // insert zero to ECX
24059 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24061 // insert zero to EDX
24062 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
24064 // insert WRPKRU instruction
24065 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
24067 MI.eraseFromParent(); // The pseudo is gone now.
24068 return BB;
24069 }
24071 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24072 const X86Subtarget &Subtarget) {
24073 DebugLoc dl = MI.getDebugLoc();
24074 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24076 // insert zero to ECX
24077 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24079 // insert RDPKRU instruction
24080 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
24081 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24082 .addReg(X86::EAX);
24084 MI.eraseFromParent(); // The pseudo is gone now.
24085 return BB;
24086 }
24088 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
24089 const X86Subtarget &Subtarget,
24090 unsigned Opc) {
24091 DebugLoc dl = MI.getDebugLoc();
24092 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24093 // Address into RAX/EAX, other two args into ECX, EDX.
24094 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24095 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24096 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24097 for (int i = 0; i < X86::AddrNumOperands; ++i)
24098 MIB.addOperand(MI.getOperand(i));
24100 unsigned ValOps = X86::AddrNumOperands;
24101 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
24102 .addReg(MI.getOperand(ValOps).getReg());
24103 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
24104 .addReg(MI.getOperand(ValOps + 1).getReg());
24106 // The instruction doesn't actually take any operands though.
24107 BuildMI(*BB, MI, dl, TII->get(Opc));
24109 MI.eraseFromParent(); // The pseudo is gone now.
24110 return BB;
24111 }
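// Illustrative sketch of the sequence emitted above (64-bit case):
//   lea rax, [mem]   ; address operand
//   mov ecx, <ext>   ; extensions operand
//   mov edx, <hints> ; hints operand
//   monitor          ; implicitly reads rax, ecx, edx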
24113 MachineBasicBlock *
24114 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
24115 MachineBasicBlock *MBB) const {
24116 // Emit va_arg instruction on X86-64.
24118 // Operands to this pseudo-instruction:
24119 // 0 ) Output : destination address (reg)
24120 // 1-5) Input : va_list address (addr, i64mem)
24121 // 6 ) ArgSize : Size (in bytes) of vararg type
24122 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
24123 // 8 ) Align : Alignment of type
24124 // 9 ) EFLAGS (implicit-def)
24126 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
24127 static_assert(X86::AddrNumOperands == 5,
24128 "VAARG_64 assumes 5 address operands");
24130 unsigned DestReg = MI.getOperand(0).getReg();
24131 MachineOperand &Base = MI.getOperand(1);
24132 MachineOperand &Scale = MI.getOperand(2);
24133 MachineOperand &Index = MI.getOperand(3);
24134 MachineOperand &Disp = MI.getOperand(4);
24135 MachineOperand &Segment = MI.getOperand(5);
24136 unsigned ArgSize = MI.getOperand(6).getImm();
24137 unsigned ArgMode = MI.getOperand(7).getImm();
24138 unsigned Align = MI.getOperand(8).getImm();
24140 // Memory Reference
24141 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
24142 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
24143 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
24145 // Machine Information
24146 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24147 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
24148 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
24149 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
24150 DebugLoc DL = MI.getDebugLoc();
24152 // struct va_list {
24153 //   i32 gp_offset
24154 //   i32 fp_offset
24155 //   i64 overflow_area (address)
24156 //   i64 reg_save_area (address)
24157 // }
24158 // sizeof(va_list) = 24
24159 // alignment(va_list) = 8
24161 unsigned TotalNumIntRegs = 6;
24162 unsigned TotalNumXMMRegs = 8;
24163 bool UseGPOffset = (ArgMode == 1);
24164 bool UseFPOffset = (ArgMode == 2);
24165 unsigned MaxOffset = TotalNumIntRegs * 8 +
24166 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
24168 /* Align ArgSize to a multiple of 8 */
24169 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
24170 bool NeedsAlign = (Align > 8);
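// Worked numbers (illustrative): with UseGPOffset, MaxOffset = 6*8 = 48; with
// UseFPOffset, MaxOffset = 48 + 8*16 = 176. An ArgSize of 12 rounds up to
// ArgSizeA8 = 16, and an Align of 16 sets NeedsAlign.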
24172 MachineBasicBlock *thisMBB = MBB;
24173 MachineBasicBlock *overflowMBB;
24174 MachineBasicBlock *offsetMBB;
24175 MachineBasicBlock *endMBB;
24177 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
24178 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
24179 unsigned OffsetReg = 0;
24181 if (!UseGPOffset && !UseFPOffset) {
24182 // If we only pull from the overflow region, we don't create a branch.
24183 // We don't need to alter control flow.
24184 OffsetDestReg = 0; // unused
24185 OverflowDestReg = DestReg;
24187 offsetMBB = nullptr;
24188 overflowMBB = thisMBB;
24189 endMBB = thisMBB;
24190 } else {
24191 // First emit code to check if gp_offset (or fp_offset) is below the bound.
24192 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
24193 // If not, pull from overflow_area. (branch to overflowMBB)
24198 // offsetMBB overflowMBB
24203 // Registers for the PHI in endMBB
24204 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
24205 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
24207 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24208 MachineFunction *MF = MBB->getParent();
24209 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24210 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24211 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24213 MachineFunction::iterator MBBIter = ++MBB->getIterator();
24215 // Insert the new basic blocks
24216 MF->insert(MBBIter, offsetMBB);
24217 MF->insert(MBBIter, overflowMBB);
24218 MF->insert(MBBIter, endMBB);
24220 // Transfer the remainder of MBB and its successor edges to endMBB.
24221 endMBB->splice(endMBB->begin(), thisMBB,
24222 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
24223 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
24225 // Make offsetMBB and overflowMBB successors of thisMBB
24226 thisMBB->addSuccessor(offsetMBB);
24227 thisMBB->addSuccessor(overflowMBB);
24229 // endMBB is a successor of both offsetMBB and overflowMBB
24230 offsetMBB->addSuccessor(endMBB);
24231 overflowMBB->addSuccessor(endMBB);
24233 // Load the offset value into a register
24234 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
        .addOperand(Base)
        .addOperand(Scale)
        .addOperand(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .addOperand(Segment)
        .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
        .addReg(OffsetReg)
        .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
        .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);
24258 // Read the reg_save_area address.
24259 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
        .addOperand(Base)
        .addOperand(Scale)
        .addOperand(Index)
        .addDisp(Disp, 16)
        .addOperand(Segment)
        .setMemRefs(MMOBegin, MMOEnd);

    // Zero-extend the offset
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
        .addImm(0)
        .addReg(OffsetReg)
        .addImm(X86::sub_32bit);
24275 // Add the offset to the reg_save_area to get the final address.
24276 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
24277 .addReg(OffsetReg64)
24278 .addReg(RegSaveReg);
24280 // Compute the offset for the next argument
24281 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
        .addReg(OffsetReg)
        .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
        .addOperand(Base)
        .addOperand(Scale)
        .addOperand(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .addOperand(Segment)
        .addReg(NextOffsetReg)
        .setMemRefs(MMOBegin, MMOEnd);

    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
        .addMBB(endMBB);
  }
24302 // Emit code to use overflow area
24305 // Load the overflow_area address into a register.
24306 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, 8)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
24319 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
24320 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
24322 // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
        .addReg(OverflowAddrReg)
        .addImm(Align - 1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
        .addReg(TmpReg)
        .addImm(~(uint64_t)(Align - 1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
        .addReg(OverflowAddrReg);
  }
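  // Worked example of the mask trick above: with Align = 16 and an overflow
  // address of 0x1008, TmpReg = 0x1008 + 15 = 0x1017 and
  // 0x1017 & ~15 = 0x1010, the next 16-byte boundary.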
24335 // Compute the next overflow address after this argument.
24336 // (the overflow address should be kept 8-byte aligned)
24337 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
24338 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
24339 .addReg(OverflowDestReg)
24340 .addImm(ArgSizeA8);
24342 // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, 8)
      .addOperand(Segment)
      .addReg(NextAddrReg)
      .setMemRefs(MMOBegin, MMOEnd);
  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
        .addReg(OffsetDestReg).addMBB(offsetMBB)
        .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI.eraseFromParent();

  return endMBB;
}
24366 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
24367 MachineInstr &MI, MachineBasicBlock *MBB) const {
24368 // Emit code to save XMM registers to the stack. The ABI says that the
24369 // number of registers to save is given in %al, so it's theoretically
24370 // possible to do an indirect jump trick to avoid saving all of them,
24371 // however this code takes a simpler approach and just executes all
24372 // of the stores if %al is non-zero. It's less code, and it's probably
24373 // easier on the hardware branch predictor, and stores aren't all that
24374 // expensive anyway.
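  //
  // For reference (SysV x86-64 varargs convention): the caller places the
  // number of vector registers actually used in %al, so e.g. a call such as
  // printf("%f\n", x) is emitted with something like "movl $1, %eax" right
  // before the call instruction.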
24376 // Create the new basic blocks. One block contains all the XMM stores,
24377 // and one block is the final destination regardless of whether any
24378 // stores were performed.
24379 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24380 MachineFunction *F = MBB->getParent();
24381 MachineFunction::iterator MBBIter = ++MBB->getIterator();
24382 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
24383 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
24384 F->insert(MBBIter, XMMSaveMBB);
24385 F->insert(MBBIter, EndMBB);
24387 // Transfer the remainder of MBB and its successor edges to EndMBB.
24388 EndMBB->splice(EndMBB->begin(), MBB,
24389 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24390 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
24392 // The original block will now fall through to the XMM save block.
24393 MBB->addSuccessor(XMMSaveMBB);
24394 // The XMMSaveMBB will fall through to the end block.
24395 XMMSaveMBB->addSuccessor(EndMBB);
24397 // Now add the instructions.
24398 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24399 DebugLoc DL = MI.getDebugLoc();
24401 unsigned CountReg = MI.getOperand(0).getReg();
24402 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
24403 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
24405 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
24406 // If %al is 0, branch around the XMM save block.
24407 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
24408 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }
24412 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
24413 // that was just emitted, but clearly shouldn't be "saved".
24414 assert((MI.getNumOperands() <= 3 ||
24415 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
24416 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
24417 "Expected last argument to be EFLAGS");
24418 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
24419 // In the XMM save block, save all the XMM argument registers.
24420 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
24421 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
24422 MachineMemOperand *MMO = F->getMachineMemOperand(
24423 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
24424 MachineMemOperand::MOStore,
24425 /*Size=*/16, /*Align=*/16);
24426 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
24427 .addFrameIndex(RegSaveFrameIndex)
24428 .addImm(/*Scale=*/1)
24429 .addReg(/*IndexReg=*/0)
24430 .addImm(/*Disp=*/Offset)
24431 .addReg(/*Segment=*/0)
24432 .addReg(MI.getOperand(i).getReg())
        .addMemOperand(MMO);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.

  return EndMBB;
}
// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
24446 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
24447 MachineBasicBlock* BB,
24448 const TargetRegisterInfo* TRI) {
24449 // Scan forward through BB for a use/def of EFLAGS.
24450 MachineBasicBlock::iterator miI(std::next(SelectItr));
24451 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
24452 const MachineInstr& mi = *miI;
    if (mi.readsRegister(X86::EFLAGS))
      return false;
    if (mi.definesRegister(X86::EFLAGS))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether EFLAGS is live into a
  // successor.
  if (miI == BB->end()) {
24462 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
24463 sEnd = BB->succ_end();
24464 sItr != sEnd; ++sItr) {
24465 MachineBasicBlock* succ = *sItr;
      if (succ->isLiveIn(X86::EFLAGS))
        return false;
    }
  }

  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
  // out. SelectMI should have a kill flag on EFLAGS.
  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
  return true;
}
24477 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
24478 // together with other CMOV pseudo-opcodes into a single basic-block with
24479 // conditional jump around it.
24480 static bool isCMOVPseudo(MachineInstr &MI) {
24481 switch (MI.getOpcode()) {
24482 case X86::CMOV_FR32:
24483 case X86::CMOV_FR64:
24484 case X86::CMOV_GR8:
24485 case X86::CMOV_GR16:
24486 case X86::CMOV_GR32:
24487 case X86::CMOV_RFP32:
24488 case X86::CMOV_RFP64:
24489 case X86::CMOV_RFP80:
24490 case X86::CMOV_V2F64:
24491 case X86::CMOV_V2I64:
24492 case X86::CMOV_V4F32:
24493 case X86::CMOV_V4F64:
24494 case X86::CMOV_V4I64:
24495 case X86::CMOV_V16F32:
24496 case X86::CMOV_V8F32:
24497 case X86::CMOV_V8F64:
24498 case X86::CMOV_V8I64:
24499 case X86::CMOV_V8I1:
24500 case X86::CMOV_V16I1:
24501 case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return true;

  default:
    return false;
  }
}

MachineBasicBlock *
24511 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
24512 MachineBasicBlock *BB) const {
24513 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24514 DebugLoc DL = MI.getDebugLoc();
24516 // To "insert" a SELECT_CC instruction, we actually have to insert the
24517 // diamond control-flow pattern. The incoming instruction knows the
24518 // destination vreg to set, the condition code register to branch on, the
24519 // true/false values to select between, and a branch opcode to use.
24520 const BasicBlock *LLVM_BB = BB->getBasicBlock();
24521 MachineFunction::iterator It = ++BB->getIterator();
  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
24529 MachineBasicBlock *thisMBB = BB;
24530 MachineFunction *F = BB->getParent();
24532 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
24533 // as described above, by inserting a BB, and then making a PHI at the join
24534 // point to select the true and false operands of the CMOV in the PHI.
  // The code also handles two different cases of multiple CMOV opcodes
  // in a row.
  //
  // Case 1:
  // In this case, there are multiple CMOVs in a row, all of which are based on
24541 // the same condition setting (or the exact opposite condition setting).
24542 // In this case we can lower all the CMOVs using a single inserted BB, and
24543 // then make a number of PHIs at the join point to model the CMOVs. The only
  // trickiness here is that in a case like:
24546 // t2 = CMOV cond1 t1, f1
24547 // t3 = CMOV cond1 t2, f2
24549 // when rewriting this into PHIs, we have to perform some renaming on the
24550 // temps since you cannot have a PHI operand refer to a PHI result earlier
24551 // in the same block. The "simple" but wrong lowering would be:
24553 // t2 = PHI t1(BB1), f1(BB2)
24554 // t3 = PHI t2(BB1), f2(BB2)
24556 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
24557 // renaming is to note that on the path through BB1, t2 is really just a
24558 // copy of t1, and do that renaming, properly generating:
24560 // t2 = PHI t1(BB1), f1(BB2)
24561 // t3 = PHI t1(BB1), f2(BB2)
  // Case 2:
  // We lower cascaded CMOVs such as
  //
  //   (CMOV (CMOV F, T, cc1), T, cc2)
  //
  // to two successive branches. For that, we look for another CMOV as the
  // following instruction.
24570 // Without this, we would add a PHI between the two jumps, which ends up
24571 // creating a few copies all around. For instance, for
  //
  //   (sitofp (zext (fcmp une)))
  //
  // we would generate:
  //
  //         ucomiss %xmm1, %xmm0
  //         movss  <1.0f>, %xmm0
  //         movaps  %xmm0, %xmm1
  //         jne     .LBB5_2
  //         xorps   %xmm1, %xmm1
  // .LBB5_2:
  //         jp      .LBB5_4
  //         movaps  %xmm1, %xmm0
  // .LBB5_4:
  //         retq
  //
  // because this custom-inserter would have generated:
  //
  //   A
  //   | \
  //   |  B
  //   | /
  //   C
  //   | \
  //   |  D
  //   | /
  //   E
  //
  // A: X = ...; Y = ...
  // B: empty
  // C: Z = PHI [X, A], [Y, B]
  // D: empty
  // E: PHI [X, C], [Z, D]
  //
  // If we lower both CMOVs in a single step, we can instead generate:
  //
  //   A
  //   | \
  //   |  C
  //   | /|
  //   |/ |
  //   |  |
  //   |  D
  //   | /
  //   E
  //
  // A: X = ...; Y = ...
  // D: empty
  // E: PHI [X, A], [X, C], [Y, D]
  //
  // Which, in our sitofp/fcmp example, gives us something like:
  //
  //         ucomiss %xmm1, %xmm0
  //         movss  <1.0f>, %xmm0
  //         jne     .LBB5_4
  //         xorps   %xmm0, %xmm0
  // .LBB5_4:
  //         retq
  //
24632 MachineInstr *CascadedCMOV = nullptr;
24633 MachineInstr *LastCMOV = &MI;
24634 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
24635 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
24636 MachineBasicBlock::iterator NextMIIt =
24637 std::next(MachineBasicBlock::iterator(MI));
24639 // Check for case 1, where there are multiple CMOVs with the same condition
24640 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
24641 // number of jumps the most.
24643 if (isCMOVPseudo(MI)) {
24644 // See if we have a string of CMOVS with the same condition.
24645 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
24646 (NextMIIt->getOperand(3).getImm() == CC ||
24647 NextMIIt->getOperand(3).getImm() == OppCC)) {
      LastCMOV = &*NextMIIt;
      ++NextMIIt;
    }
  }

  // This checks for case 2, but only do this if we didn't already find
  // case 1, as indicated by LastCMOV == MI.
  if (LastCMOV == &MI && NextMIIt != BB->end() &&
24656 NextMIIt->getOpcode() == MI.getOpcode() &&
24657 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
24658 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
24659 NextMIIt->getOperand(1).isKill()) {
    CascadedCMOV = &*NextMIIt;
  }

  MachineBasicBlock *jcc1MBB = nullptr;
24665 // If we have a cascaded CMOV, we lower it to two successive branches to
24666 // the same block. EFLAGS is used by both, so mark it as live in the second.
24667 if (CascadedCMOV) {
24668 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
24669 F->insert(It, jcc1MBB);
    jcc1MBB->addLiveIn(X86::EFLAGS);
  }

  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
24674 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
24675 F->insert(It, copy0MBB);
24676 F->insert(It, sinkMBB);
24678 // If the EFLAGS register isn't dead in the terminator, then claim that it's
24679 // live into the sink and copy blocks.
24680 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
24682 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
24683 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
24684 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
24685 copy0MBB->addLiveIn(X86::EFLAGS);
    sinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to sinkMBB.
24690 sinkMBB->splice(sinkMBB->begin(), BB,
24691 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
24692 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
24694 // Add the true and fallthrough blocks as its successors.
24695 if (CascadedCMOV) {
24696 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
24697 BB->addSuccessor(jcc1MBB);
24699 // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
24700 // jump to the sinkMBB.
24701 jcc1MBB->addSuccessor(copy0MBB);
    jcc1MBB->addSuccessor(copy0MBB);
    jcc1MBB->addSuccessor(sinkMBB);
  } else {
    BB->addSuccessor(copy0MBB);
  }

  // The true block target of the first (or only) branch is always sinkMBB.
24708 BB->addSuccessor(sinkMBB);
24710 // Create the conditional branch instruction.
24711 unsigned Opc = X86::GetCondBranchFromCond(CC);
24712 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
24714 if (CascadedCMOV) {
24715 unsigned Opc2 = X86::GetCondBranchFromCond(
24716 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
    BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
  }

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  copy0MBB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
24728 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
24729 MachineBasicBlock::iterator MIItEnd =
24730 std::next(MachineBasicBlock::iterator(LastCMOV));
24731 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
24732 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
24733 MachineInstrBuilder MIB;
24735 // As we are creating the PHIs, we have to be careful if there is more than
24736 // one. Later CMOVs may reference the results of earlier CMOVs, but later
24737 // PHIs have to reference the individual true/false inputs from earlier PHIs.
24738 // That also means that PHI construction must work forward from earlier to
24739 // later, and that the code must maintain a mapping from earlier PHI's
24740 // destination registers, and the registers that went into the PHI.
24742 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
24743 unsigned DestReg = MIIt->getOperand(0).getReg();
24744 unsigned Op1Reg = MIIt->getOperand(1).getReg();
24745 unsigned Op2Reg = MIIt->getOperand(2).getReg();
24747 // If this CMOV we are generating is the opposite condition from
24748 // the jump we generated, then we have to swap the operands for the
24749 // PHI that is going to be generated.
24750 if (MIIt->getOperand(3).getImm() == OppCC)
24751 std::swap(Op1Reg, Op2Reg);
24753 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
24754 Op1Reg = RegRewriteTable[Op1Reg].first;
24756 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
24757 Op2Reg = RegRewriteTable[Op2Reg].second;
24759 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
24760 TII->get(X86::PHI), DestReg)
24761 .addReg(Op1Reg).addMBB(copy0MBB)
24762 .addReg(Op2Reg).addMBB(thisMBB);
24764 // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // If we have a cascaded CMOV, the second Jcc provides the same incoming
24769 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
24770 if (CascadedCMOV) {
24771 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
24772 // Copy the PHI result to the register defined by the second CMOV.
24773 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
24774 DL, TII->get(TargetOpcode::COPY),
24775 CascadedCMOV->getOperand(0).getReg())
24776 .addReg(MI.getOperand(0).getReg());
    CascadedCMOV->eraseFromParent();
  }

  // Now remove the CMOV(s).
24781 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
    (MIIt++)->eraseFromParent();

  return sinkMBB;
}

MachineBasicBlock *
24788 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
24789 MachineBasicBlock *BB) const {
24790 // Combine the following atomic floating-point modification pattern:
24791 // a.store(reg OP a.load(acquire), release)
24792 // Transform them into:
24793 // OPss (%gpr), %xmm
24794 // movss %xmm, (%gpr)
24795 // Or sd equivalent for 64-bit operations.
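  //
  // As a rough source-level sketch (names are illustrative only), this
  // handles IR produced from code such as:
  //
  //   std::atomic<float> a;
  //   a.store(x + a.load(std::memory_order_acquire),
  //           std::memory_order_release);
  //
  // which on x86 can legally become a plain "addss (%mem), %xmm0" followed
  // by "movss %xmm0, (%mem)", since x86 loads and stores already carry the
  // required acquire/release semantics.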
  unsigned MOp, FOp;
  switch (MI.getOpcode()) {
  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
  case X86::RELEASE_FADD32mr:
    FOp = X86::ADDSSrm;
    MOp = X86::MOVSSmr;
    break;
  case X86::RELEASE_FADD64mr:
    FOp = X86::ADDSDrm;
    MOp = X86::MOVSDmr;
    break;
  }

  const X86InstrInfo *TII = Subtarget.getInstrInfo();
24809 DebugLoc DL = MI.getDebugLoc();
24810 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
24811 unsigned ValOpIdx = X86::AddrNumOperands;
24812 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
24813 MachineInstrBuilder MIB =
24814 BuildMI(*BB, MI, DL, TII->get(FOp),
              MRI.createVirtualRegister(MRI.getRegClass(VSrc)));
24817 for (int i = 0; i < X86::AddrNumOperands; ++i) {
24818 MachineOperand &Operand = MI.getOperand(i);
24819 // Clear any kill flags on register operands as we'll create a second
24820 // instruction using the same address operands.
24821 if (Operand.isReg())
24822 Operand.setIsKill(false);
    MIB.addOperand(Operand);
  }
  MachineInstr *FOpMI = MIB;
24826 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
24827 for (int i = 0; i < X86::AddrNumOperands; ++i)
24828 MIB.addOperand(MI.getOperand(i));
24829 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}

MachineBasicBlock *
24835 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
24836 MachineBasicBlock *BB) const {
24837 MachineFunction *MF = BB->getParent();
24838 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24839 DebugLoc DL = MI.getDebugLoc();
24840 const BasicBlock *LLVM_BB = BB->getBasicBlock();
24842 assert(MF->shouldSplitStack());
24844 const bool Is64Bit = Subtarget.is64Bit();
24845 const bool IsLP64 = Subtarget.isTarget64BitLP64();
24847 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
24848 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
  // BB:
  //  ... [Till the alloca]
  // If stacklet is not large enough, jump to mallocMBB
  //
  // bumpMBB:
  //  Allocate by subtracting from RSP
  //  Jump to continueMBB
  //
  // mallocMBB:
  //  Allocate by call to runtime
  //
  // continueMBB:
  //  ...
  //  [rest of original BB]
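  //
  // Roughly (an illustrative sketch, not the exact emitted asm, for the LP64
  // case where the stacklet bound lives at %fs:0x70):
  //
  //   movq %rsp, %t
  //   subq %size, %t          # candidate new stack pointer
  //   cmpq %t, %fs:0x70       # compare the stacklet bound against it
  //   jg   mallocMBB          # bound above candidate -> need a new stacklet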
24866 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24867 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24868 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24870 MachineRegisterInfo &MRI = MF->getRegInfo();
24871 const TargetRegisterClass *AddrRegClass =
24872 getRegClassFor(getPointerTy(MF->getDataLayout()));
24874 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
24875 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
24876 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
24877 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
           sizeVReg = MI.getOperand(1).getReg(),
           physSPReg =
               IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
24882 MachineFunction::iterator MBBIter = ++BB->getIterator();
24884 MF->insert(MBBIter, bumpMBB);
24885 MF->insert(MBBIter, mallocMBB);
24886 MF->insert(MBBIter, continueMBB);
24888 continueMBB->splice(continueMBB->begin(), BB,
24889 std::next(MachineBasicBlock::iterator(MI)), BB->end());
24890 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
24892 // Add code to the main basic block to check if the stack limit has been hit,
24893 // and if so, jump to mallocMBB otherwise to bumpMBB.
24894 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
24895 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
24896 .addReg(tmpSPVReg).addReg(sizeVReg);
24897 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
24898 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
24899 .addReg(SPLimitVReg);
24900 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
24902 // bumpMBB simply decreases the stack pointer, since we know the current
24903 // stacklet has enough space.
24904 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
24905 .addReg(SPLimitVReg);
24906 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
24907 .addReg(SPLimitVReg);
24908 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
24910 // Calls into a routine in libgcc to allocate more space from the heap.
24911 const uint32_t *RegMask =
      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
  if (IsLP64) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
        .addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
24917 .addExternalSymbol("__morestack_allocate_stack_space")
24918 .addRegMask(RegMask)
24919 .addReg(X86::RDI, RegState::Implicit)
24920 .addReg(X86::RAX, RegState::ImplicitDefine);
24921 } else if (Is64Bit) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
        .addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
24925 .addExternalSymbol("__morestack_allocate_stack_space")
24926 .addRegMask(RegMask)
24927 .addReg(X86::EDI, RegState::Implicit)
        .addReg(X86::EAX, RegState::ImplicitDefine);
  } else {
    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
        .addImm(16);
    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
24933 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
24934 .addExternalSymbol("__morestack_allocate_stack_space")
24935 .addRegMask(RegMask)
        .addReg(X86::EAX, RegState::ImplicitDefine);
  }

  if (!Is64Bit)
    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
        .addImm(16);

  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
24944 .addReg(IsLP64 ? X86::RAX : X86::EAX);
24945 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
24947 // Set up the CFG correctly.
24948 BB->addSuccessor(bumpMBB);
24949 BB->addSuccessor(mallocMBB);
24950 mallocMBB->addSuccessor(continueMBB);
24951 bumpMBB->addSuccessor(continueMBB);
24953 // Take care of the PHI nodes.
24954 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
24955 MI.getOperand(0).getReg())
      .addReg(mallocPtrVReg)
      .addMBB(mallocMBB)
      .addReg(bumpSPPtrVReg)
      .addMBB(bumpMBB);

  // Delete the original pseudo instruction.
24962 MI.eraseFromParent();
  return continueMBB;
}

MachineBasicBlock *
24969 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
24970 MachineBasicBlock *BB) const {
24971 MachineFunction *MF = BB->getParent();
24972 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
24973 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
24974 DebugLoc DL = MI.getDebugLoc();
24976 assert(!isAsynchronousEHPersonality(
24977 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
24978 "SEH does not use catchret!");
24980 // Only 32-bit EH needs to worry about manually restoring stack pointers.
  if (!Subtarget.is32Bit())
    return BB;

  // C++ EH creates a new target block to hold the restore code, and wires up
24985 // the new block to the return destination with a normal JMP_4.
24986 MachineBasicBlock *RestoreMBB =
24987 MF->CreateMachineBasicBlock(BB->getBasicBlock());
24988 assert(BB->succ_size() == 1);
24989 MF->insert(std::next(BB->getIterator()), RestoreMBB);
24990 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
24991 BB->addSuccessor(RestoreMBB);
24992 MI.getOperand(0).setMBB(RestoreMBB);
24994 auto RestoreMBBI = RestoreMBB->begin();
24995 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
  return BB;
}

MachineBasicBlock *
25001 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
25002 MachineBasicBlock *BB) const {
25003 MachineFunction *MF = BB->getParent();
25004 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
25005 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
25006 // Only 32-bit SEH requires special handling for catchpad.
25007 if (IsSEH && Subtarget.is32Bit()) {
25008 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25009 DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
  }
  MI.eraseFromParent();
  return BB;
}

MachineBasicBlock *
25017 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
25018 MachineBasicBlock *BB) const {
  // So, here we replace TLSADDR with the sequence:
  // adjust_stackdown -> TLSADDR -> adjust_stackup.
  // We need this because TLSADDR is lowered into calls
  // inside MC, therefore without the two markers shrink-wrapping
  // may push the prologue/epilogue past them.
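  //
  // Schematically, the resulting MI sequence is (pseudo opcodes shown for
  // the 64-bit case; the actual opcodes come from the call-frame hooks used
  // below):
  //
  //   ADJCALLSTACKDOWN64 0, 0
  //   TLSADDR ...
  //   ADJCALLSTACKUP64 0, 0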
25024 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25025 DebugLoc DL = MI.getDebugLoc();
25026 MachineFunction &MF = *BB->getParent();
25028 // Emit CALLSEQ_START right before the instruction.
25029 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
25030 MachineInstrBuilder CallseqStart =
25031 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
25032 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
25034 // Emit CALLSEQ_END right after the instruction.
25035 // We don't call erase from parent because we want to keep the
25036 // original instruction around.
25037 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
25038 MachineInstrBuilder CallseqEnd =
25039 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
  BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);

  return BB;
}

MachineBasicBlock *
25046 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
25047 MachineBasicBlock *BB) const {
25048 // This is pretty easy. We're taking the value that we received from
25049 // our load from the relocation, sticking it in either RDI (x86-64)
25050 // or EAX and doing an indirect call. The return value will then
25051 // be in the normal return register.
25052 MachineFunction *F = BB->getParent();
25053 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25054 DebugLoc DL = MI.getDebugLoc();
25056 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
25057 assert(MI.getOperand(3).isGlobal() && "This should be a global");
25059 // Get a register mask for the lowered call.
25060 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
25061 // proper register mask.
25062 const uint32_t *RegMask =
25063 Subtarget.is64Bit() ?
25064 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
25065 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
25066 if (Subtarget.is64Bit()) {
25067 MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
            .addReg(X86::RIP)
            .addImm(1)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
25076 addDirectMem(MIB, X86::RDI);
25077 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
25078 } else if (!isPositionIndependent()) {
25079 MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(0)
            .addImm(1)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25088 addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(TII->getGlobalBaseReg(F))
            .addImm(1)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25100 addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}

MachineBasicBlock *
25109 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
25110 MachineBasicBlock *MBB) const {
25111 DebugLoc DL = MI.getDebugLoc();
25112 MachineFunction *MF = MBB->getParent();
25113 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25114 MachineRegisterInfo &MRI = MF->getRegInfo();
25116 const BasicBlock *BB = MBB->getBasicBlock();
25117 MachineFunction::iterator I = ++MBB->getIterator();
25119 // Memory Reference
25120 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25121 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
  unsigned DstReg;
  unsigned MemOpndSlot = 0;

  unsigned CurOp = 0;

  DstReg = MI.getOperand(CurOp++).getReg();
25129 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25130 assert(RC->hasType(MVT::i32) && "Invalid destination!");
25131 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25132 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
25134 MemOpndSlot = CurOp;
25136 MVT PVT = getPointerTy(MF->getDataLayout());
25137 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25138 "Invalid Pointer Size!");
  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
  //  SjLjSetup restoreMBB
  //
  // mainMBB:
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //
  // restoreMBB:
  //  if base pointer being used, load it from frame
  //  v_restore = 1
25156 MachineBasicBlock *thisMBB = MBB;
25157 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25158 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25159 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
25160 MF->insert(I, mainMBB);
25161 MF->insert(I, sinkMBB);
25162 MF->push_back(restoreMBB);
25163 restoreMBB->setHasAddressTaken();
25165 MachineInstrBuilder MIB;
25167 // Transfer the remainder of BB and its successor edges to sinkMBB.
25168 sinkMBB->splice(sinkMBB->begin(), MBB,
25169 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25170 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25173 unsigned PtrStoreOpc = 0;
25174 unsigned LabelReg = 0;
25175 const int64_t LabelOffset = 1 * PVT.getStoreSize();
25176 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
25177 !isPositionIndependent();
25179 // Prepare IP either in reg or imm.
25180 if (!UseImmLabel) {
25181 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
25182 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
25183 LabelReg = MRI.createVirtualRegister(PtrRC);
25184 if (Subtarget.is64Bit()) {
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
              .addReg(X86::RIP)
              .addImm(1)
              .addReg(0)
              .addMBB(restoreMBB)
              .addReg(0);
    } else {
      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
              .addReg(XII->getGlobalBaseReg(MF))
              .addImm(1)
              .addReg(0)
              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
              .addReg(0);
    }
  } else
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
25203 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
25204 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25205 if (i == X86::AddrDisp)
25206 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
    else
      MIB.addOperand(MI.getOperand(MemOpndSlot + i));
  }
  if (!UseImmLabel)
    MIB.addReg(LabelReg);
  else
    MIB.addMBB(restoreMBB);
  MIB.setMemRefs(MMOBegin, MMOEnd);
25216 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
25217 .addMBB(restoreMBB);
25219 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25220 MIB.addRegMask(RegInfo->getNoPreservedMask());
25221 thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(restoreMBB);

  // mainMBB:
  //  EAX = 0
  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
25231 TII->get(X86::PHI), DstReg)
25232 .addReg(mainDstReg).addMBB(mainMBB)
      .addReg(restoreDstReg).addMBB(restoreMBB);

  // restoreMBB:
  if (RegInfo->hasBasePointer(*MF)) {
25237 const bool Uses64BitFramePtr =
25238 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
25239 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
25240 X86FI->setRestoreBasePointer(MF);
25241 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
25242 unsigned BasePtr = RegInfo->getBaseRegister();
25243 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
25244 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
25245 FramePtr, true, X86FI->getRestoreBasePointerOffset())
        .setMIFlag(MachineInstr::FrameSetup);
  }
  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
25249 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25250 restoreMBB->addSuccessor(sinkMBB);
  MI.eraseFromParent();
  return sinkMBB;
}

MachineBasicBlock *
25257 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
25258 MachineBasicBlock *MBB) const {
25259 DebugLoc DL = MI.getDebugLoc();
25260 MachineFunction *MF = MBB->getParent();
25261 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25262 MachineRegisterInfo &MRI = MF->getRegInfo();
25264 // Memory Reference
25265 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25266 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25268 MVT PVT = getPointerTy(MF->getDataLayout());
25269 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25270 "Invalid Pointer Size!");
25272 const TargetRegisterClass *RC =
25273 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
25274 unsigned Tmp = MRI.createVirtualRegister(RC);
25275 // Since FP is only updated here but NOT referenced, it's treated as GPR.
25276 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25277 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
25278 unsigned SP = RegInfo->getStackRegister();
25280 MachineInstrBuilder MIB;
25282 const int64_t LabelOffset = 1 * PVT.getStoreSize();
25283 const int64_t SPOffset = 2 * PVT.getStoreSize();
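  // The buffer layout assumed here matches what emitEHSjLjSetJmp wrote:
  // slot 0 holds the frame pointer, slot 1 (LabelOffset) the resume address,
  // and slot 2 (SPOffset) the stack pointer, each slot pointer-sized.
  // Roughly: reload FP, load the resume IP into a scratch register, reload
  // SP, then jump through the scratch register.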
25285 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

  // Reload FP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
25290 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
25291 MIB.addOperand(MI.getOperand(i));
  MIB.setMemRefs(MMOBegin, MMOEnd);

  // Reload IP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
25295 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25296 if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), LabelOffset);
    else
      MIB.addOperand(MI.getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);

  // Reload SP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
25304 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25305 if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), SPOffset);
    else
      MIB.addOperand(MI.getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);
25312 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
  MI.eraseFromParent();
  return MBB;
}

void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
25319 MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI) const {
  DebugLoc DL = MI.getDebugLoc();
25323 MachineFunction *MF = MBB->getParent();
25324 MachineRegisterInfo *MRI = &MF->getRegInfo();
25325 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25327 MVT PVT = getPointerTy(MF->getDataLayout());
25328 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
  unsigned Op = 0;
  unsigned VR = 0;

  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();

  if (UseImmLabel) {
    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
  } else {
    const TargetRegisterClass *TRC =
25340 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
25341 VR = MRI->createVirtualRegister(TRC);
25342 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
25344 /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
25346 if (Subtarget.is64Bit())
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
          .addReg(X86::RIP)
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB)
          .addReg(0);
    else
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
          .addReg(0) /* XII->getGlobalBaseReg(MF) */
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
          .addReg(0);
  }
25362 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
  addFrameReference(MIB, FI, 36);
  if (UseImmLabel)
    MIB.addMBB(DispatchBB);
  else
    MIB.addReg(VR);
}

MachineBasicBlock *
25371 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
25372 MachineBasicBlock *BB) const {
25373 DebugLoc DL = MI.getDebugLoc();
25374 MachineFunction *MF = BB->getParent();
25375 MachineFrameInfo &MFI = MF->getFrameInfo();
25376 MachineRegisterInfo *MRI = &MF->getRegInfo();
25377 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25378 int FI = MFI.getFunctionContextIndex();
25380 // Get a mapping of the call site numbers to all of the landing pads they're
25381 // associated with.
25382 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
25383 unsigned MaxCSNum = 0;
25384 for (auto &MBB : *MF) {
    if (!MBB.isEHPad())
      continue;

    MCSymbol *Sym = nullptr;
    for (const auto &MI : MBB) {
      if (MI.isDebugValue())
        continue;

      assert(MI.isEHLabel() && "expected EH_LABEL");
      Sym = MI.getOperand(0).getMCSymbol();
      break;
    }

    if (!MF->hasCallSiteLandingPad(Sym))
      continue;

    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
      CallSiteNumToLPad[CSI].push_back(&MBB);
      MaxCSNum = std::max(MaxCSNum, CSI);
    }
  }
25407 // Get an ordered list of the machine basic blocks for the jump table.
25408 std::vector<MachineBasicBlock *> LPadList;
25409 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
25410 LPadList.reserve(CallSiteNumToLPad.size());
25412 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
25413 for (auto &LP : CallSiteNumToLPad[CSI]) {
25414 LPadList.push_back(LP);
      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
    }
  }

  assert(!LPadList.empty() &&
25420 "No landing pad destinations for the dispatch jump table!");
25422 // Create the MBBs for the dispatch code.
25424 // Shove the dispatch's address into the return slot in the function context.
25425 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
25426 DispatchBB->setIsEHPad(true);
25428 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
25429 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
25430 DispatchBB->addSuccessor(TrapBB);
25432 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert MBBs.
  MF->push_back(DispatchBB);
25437 MF->push_back(DispContBB);
25438 MF->push_back(TrapBB);
  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
25444 // Create the jump table and associated information
25445 MachineJumpTableInfo *JTI =
25446 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
25447 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
25449 const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
25450 const X86RegisterInfo &RI = XII->getRegisterInfo();
25452 // Add a register mask with no preserved registers. This results in all
25453 // registers being marked as clobbered.
25454 if (RI.hasBasePointer(*MF)) {
25455 const bool FPIs64Bit =
25456 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
25457 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
25458 MFI->setRestoreBasePointer(MF);
25460 unsigned FP = RI.getFrameRegister(*MF);
25461 unsigned BP = RI.getBaseRegister();
25462 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
25463 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
25464 MFI->getRestoreBasePointerOffset())
        .addRegMask(RI.getNoPreservedMask());
  } else {
    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
        .addRegMask(RI.getNoPreservedMask());
  }
25471 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
                    Subtarget.is64Bit() ? 8 : 4);
  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
      .addReg(IReg)
      .addImm(LPadList.size());
  BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
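  // In pseudo-C, the dispatch emitted here and below is roughly (names are
  // illustrative):
  //
  //   unsigned cs = fn_context.call_site;   // 1-based call-site index
  //   if (cs > LPadList.size()) trap();
  //   goto *jump_table[cs - 1];             // 0-based jump table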
25479 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
      .addReg(IReg)
      .addImm(1);
  BuildMI(DispContBB, DL,
          TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
      .addReg(0)
      .addImm(Subtarget.is64Bit() ? 8 : 4)
      .addReg(JReg)
      .addJumpTableIndex(MJTI)
      .addReg(0);
25491 // Add the jump table entries as successors to the MBB.
25492 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
25493 for (auto &LP : LPadList)
25494 if (SeenMBBs.insert(LP).second)
25495 DispContBB->addSuccessor(LP);
25497 // N.B. the order the invoke BBs are processed in doesn't matter here.
25498 SmallVector<MachineBasicBlock *, 64> MBBLPads;
25499 const MCPhysReg *SavedRegs =
25500 Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
25501 for (MachineBasicBlock *MBB : InvokeBBs) {
25502 // Remove the landing pad successor from the invoke block and replace it
25503 // with the new dispatch block.
25504 // Keep a copy of Successors since it's modified inside the loop.
    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
                                                   MBB->succ_rend());
    // FIXME: Avoid quadratic complexity.
    for (auto MBBS : Successors) {
      if (MBBS->isEHPad()) {
        MBB->removeSuccessor(MBBS);
        MBBLPads.push_back(MBBS);
      }
    }
25515 MBB->addSuccessor(DispatchBB);
25517 // Find the invoke call and mark all of the callee-saved registers as
25518 // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (auto &II : reverse(*MBB)) {
      if (!II.isCall())
        continue;

      DenseMap<unsigned, bool> DefRegs;
      for (auto &MOp : II.operands())
        if (MOp.isReg())
          DefRegs[MOp.getReg()] = true;
25530 MachineInstrBuilder MIB(*MF, &II);
25531 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
        unsigned Reg = SavedRegs[RI];
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the only
25542 // landing pad now.
25543 for (auto &LP : MBBLPads)
25544 LP->setIsEHPad(false);
25546 // The instruction is gone now.
  MI.eraseFromParent();
  return BB;
}

MachineBasicBlock *
25552 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
25553 MachineBasicBlock *BB) const {
25554 MachineFunction *MF = BB->getParent();
25555 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25556 DebugLoc DL = MI.getDebugLoc();
25558 switch (MI.getOpcode()) {
25559 default: llvm_unreachable("Unexpected instr type to insert");
25560 case X86::TAILJMPd64:
25561 case X86::TAILJMPr64:
25562 case X86::TAILJMPm64:
25563 case X86::TAILJMPr64_REX:
25564 case X86::TAILJMPm64_REX:
25565 llvm_unreachable("TAILJMP64 would not be touched here.");
25566 case X86::TCRETURNdi64:
25567 case X86::TCRETURNri64:
  case X86::TCRETURNmi64:
    return BB;
  case X86::TLS_addr32:
25571 case X86::TLS_addr64:
25572 case X86::TLS_base_addr32:
25573 case X86::TLS_base_addr64:
25574 return EmitLoweredTLSAddr(MI, BB);
25575 case X86::CATCHRET:
25576 return EmitLoweredCatchRet(MI, BB);
25577 case X86::CATCHPAD:
25578 return EmitLoweredCatchPad(MI, BB);
25579 case X86::SEG_ALLOCA_32:
25580 case X86::SEG_ALLOCA_64:
25581 return EmitLoweredSegAlloca(MI, BB);
25582 case X86::TLSCall_32:
25583 case X86::TLSCall_64:
25584 return EmitLoweredTLSCall(MI, BB);
25585 case X86::CMOV_FR32:
25586 case X86::CMOV_FR64:
25587 case X86::CMOV_FR128:
25588 case X86::CMOV_GR8:
25589 case X86::CMOV_GR16:
25590 case X86::CMOV_GR32:
25591 case X86::CMOV_RFP32:
25592 case X86::CMOV_RFP64:
25593 case X86::CMOV_RFP80:
25594 case X86::CMOV_V2F64:
25595 case X86::CMOV_V2I64:
25596 case X86::CMOV_V4F32:
25597 case X86::CMOV_V4F64:
25598 case X86::CMOV_V4I64:
25599 case X86::CMOV_V16F32:
25600 case X86::CMOV_V8F32:
25601 case X86::CMOV_V8F64:
25602 case X86::CMOV_V8I64:
25603 case X86::CMOV_V8I1:
25604 case X86::CMOV_V16I1:
25605 case X86::CMOV_V32I1:
25606 case X86::CMOV_V64I1:
25607 return EmitLoweredSelect(MI, BB);
25609 case X86::RDFLAGS32:
  case X86::RDFLAGS64: {
    unsigned PushF =
        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
25613 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
25614 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
25615 // Permit reads of the FLAGS register without it being defined.
25616 // This intrinsic exists to read external processor state in flags, such as
25617 // the trap flag, interrupt flag, and direction flag, none of which are
25618 // modeled by the backend.
25619 Push->getOperand(2).setIsUndef();
25620 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }

  case X86::WRFLAGS32:
  case X86::WRFLAGS64: {
    unsigned Push =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
    unsigned PopF =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
25632 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
25633 BuildMI(*BB, MI, DL, TII->get(PopF));
    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }

  case X86::RELEASE_FADD32mr:
25640 case X86::RELEASE_FADD64mr:
25641 return EmitLoweredAtomicFP(MI, BB);
25643 case X86::FP32_TO_INT16_IN_MEM:
25644 case X86::FP32_TO_INT32_IN_MEM:
25645 case X86::FP32_TO_INT64_IN_MEM:
25646 case X86::FP64_TO_INT16_IN_MEM:
25647 case X86::FP64_TO_INT32_IN_MEM:
25648 case X86::FP64_TO_INT64_IN_MEM:
25649 case X86::FP80_TO_INT16_IN_MEM:
25650 case X86::FP80_TO_INT32_IN_MEM:
25651 case X86::FP80_TO_INT64_IN_MEM: {
25652 // Change the floating point control register to use "round towards zero"
25653 // mode when truncating to an integer value.
25654 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
25655 addFrameReference(BuildMI(*BB, MI, DL,
25656 TII->get(X86::FNSTCW16m)), CWFrameIdx);
    // Load the old value of the high byte of the control word...
    unsigned OldCW =
        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);

    // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
        .addImm(0xC7F);
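    // 0xC7F sets the x87 rounding-control field (bits 10-11) to 11b, i.e.
    // "round toward zero", while keeping the precision control at 64 bits
    // and all exception mask bits set -- which is what C-style float->int
    // truncation requires.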
25668 // Reload the modified control word now...
25669 addFrameReference(BuildMI(*BB, MI, DL,
25670 TII->get(X86::FLDCW16m)), CWFrameIdx);
25672 // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
        .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI.getOpcode()) {
25679 default: llvm_unreachable("illegal opcode!");
25680 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
25681 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
25682 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
25683 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
25684 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
25685 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
25686 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
25687 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    X86AddressMode AM = getAddressFromInstr(&MI, 0);
25692 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
25693 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
25695 // Reload the original control word now.
25696 addFrameReference(BuildMI(*BB, MI, DL,
25697 TII->get(X86::FLDCW16m)), CWFrameIdx);
    MI.eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }

  // String/text processing lowering.
25703 case X86::PCMPISTRM128REG:
25704 case X86::VPCMPISTRM128REG:
25705 case X86::PCMPISTRM128MEM:
25706 case X86::VPCMPISTRM128MEM:
25707 case X86::PCMPESTRM128REG:
25708 case X86::VPCMPESTRM128REG:
25709 case X86::PCMPESTRM128MEM:
25710 case X86::VPCMPESTRM128MEM:
25711 assert(Subtarget.hasSSE42() &&
25712 "Target must have SSE4.2 or AVX features enabled");
25713 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
25715 // String/text processing lowering.
25716 case X86::PCMPISTRIREG:
25717 case X86::VPCMPISTRIREG:
25718 case X86::PCMPISTRIMEM:
25719 case X86::VPCMPISTRIMEM:
25720 case X86::PCMPESTRIREG:
25721 case X86::VPCMPESTRIREG:
25722 case X86::PCMPESTRIMEM:
25723 case X86::VPCMPESTRIMEM:
25724 assert(Subtarget.hasSSE42() &&
25725 "Target must have SSE4.2 or AVX features enabled");
25726 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
  // Thread synchronization.
  case X86::MONITOR:
    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
  case X86::MONITORX:
    return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
  case X86::WRPKRU:
    return emitWRPKRU(MI, BB, Subtarget);
  case X86::RDPKRU:
    return emitRDPKRU(MI, BB, Subtarget);
  case X86::XBEGIN:
    return emitXBegin(MI, BB, Subtarget.getInstrInfo());
25742 case X86::VASTART_SAVE_XMM_REGS:
25743 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
25745 case X86::VAARG_64:
25746 return EmitVAARG64WithCustomInserter(MI, BB);
25748 case X86::EH_SjLj_SetJmp32:
25749 case X86::EH_SjLj_SetJmp64:
25750 return emitEHSjLjSetJmp(MI, BB);
25752 case X86::EH_SjLj_LongJmp32:
25753 case X86::EH_SjLj_LongJmp64:
25754 return emitEHSjLjLongJmp(MI, BB);
25756 case X86::Int_eh_sjlj_setup_dispatch:
25757 return EmitSjLjDispatchBlock(MI, BB);
25759 case TargetOpcode::STATEPOINT:
25760 // As an implementation detail, STATEPOINT shares the STACKMAP format at
25761 // this point in the process. We diverge later.
25762 return emitPatchPoint(MI, BB);
25764 case TargetOpcode::STACKMAP:
25765 case TargetOpcode::PATCHPOINT:
25766 return emitPatchPoint(MI, BB);
25768 case X86::LCMPXCHG8B: {
25769 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
25770 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
25771 // requires a memory operand. If it happens that current architecture is
25772 // i686 and for current function we need a base pointer
25773 // - which is ESI for i686 - register allocator would not be able to
25774 // allocate registers for an address in form of X(%reg, %reg, Y)
25775 // - there never would be enough unreserved registers during regalloc
    // (without the need for a base ptr, the only option would be
    // X(%edi, %esi, Y)).
25777 // We are giving a hand to register allocator by precomputing the address in
25778 // a new vreg using LEA.
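    //
    // Illustratively, instead of asking regalloc to materialize
    //
    //   cmpxchg8b disp(%basereg, %idxreg, scale)
    //
    // (impossible once EAX/EBX/ECX/EDX and the ESI base pointer are all
    // claimed), we emit roughly:
    //
    //   leal disp(%basereg, %idxreg, scale), %vreg
    //   cmpxchg8b (%vreg)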
25780 // If it is not i686 or there is no base pointer - nothing to do here.
    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
      return BB;

    // Even though this code does not necessarily need the base pointer to
25785 // be ESI, we check for that. The reason: if this assert fails, there are
25786 // some changes happened in the compiler base pointer handling, which most
25787 // probably have to be addressed somehow here.
25788 assert(TRI->getBaseRegister() == X86::ESI &&
25789 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
25790 "base pointer in mind");
25792 MachineRegisterInfo &MRI = MF->getRegInfo();
25793 MVT SPTy = getPointerTy(MF->getDataLayout());
25794 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25795 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
25797 X86AddressMode AM = getAddressFromInstr(&MI, 0);
25798 // Regalloc does not need any help when the memory operand of CMPXCHG8B
25799 // does not use index register.
    if (AM.IndexReg == X86::NoRegister)
      return BB;

    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
25804 // four operand definitions that are E[ABCD] registers. We skip them and
25805 // then insert the LEA.
25806 MachineBasicBlock::iterator MBBI(MI);
25807 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
           MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
      ++MBBI;
    addFullAddress(
        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);

    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

    return BB;
  }
  case X86::LCMPXCHG16B:
    return BB;
  case X86::LCMPXCHG8B_SAVE_EBX:
  case X86::LCMPXCHG16B_SAVE_RBX: {
    unsigned BasePtr =
        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
25823 if (!BB->isLiveIn(BasePtr))
      BB->addLiveIn(BasePtr);
    return BB;
  }
  }
}

//===----------------------------------------------------------------------===//
25831 // X86 Optimization Hooks
25832 //===----------------------------------------------------------------------===//
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      APInt &KnownZero,
                                                      APInt &KnownOne,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
25839 unsigned BitWidth = KnownZero.getBitWidth();
25840 unsigned Opc = Op.getOpcode();
25841 assert((Opc >= ISD::BUILTIN_OP_END ||
25842 Opc == ISD::INTRINSIC_WO_CHAIN ||
25843 Opc == ISD::INTRINSIC_W_CHAIN ||
25844 Opc == ISD::INTRINSIC_VOID) &&
25845 "Should use MaskedValueIsZero if you don't know whether Op"
25846 " is a target node!");
  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
  switch (Opc) {
  default: break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::SMUL:
  case X86ISD::UMUL:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // These nodes' second result is a boolean.
    if (Op.getResNo() == 0)
      break;
    LLVM_FALLTHROUGH;
  case X86ISD::SETCC:
    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
    break;
25869 case X86ISD::MOVMSK: {
25870 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
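    // MOVMSK produces one bit per input vector element; e.g. for a v4f32
    // operand the result has 4 meaningful low bits, so the upper
    // BitWidth - 4 bits are known zero.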
    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
    break;
  }
  case X86ISD::VZEXT: {
25875 SDValue N0 = Op.getOperand(0);
25876 unsigned NumElts = Op.getValueType().getVectorNumElements();
25877 unsigned InNumElts = N0.getValueType().getVectorNumElements();
25878 unsigned InBitWidth = N0.getValueType().getScalarSizeInBits();
25880 KnownZero = KnownOne = APInt(InBitWidth, 0);
25881 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25882 DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
25883 KnownOne = KnownOne.zext(BitWidth);
25884 KnownZero = KnownZero.zext(BitWidth);
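    // Everything above the source element width is zero-filled by VZEXT;
    // e.g. zero-extending v8i16 -> v8i32 pins the top 16 bits of every lane
    // to zero.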
    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth);
    break;
  }
  }
}

unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
25892 SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
25893 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
25894 if (Op.getOpcode() == X86ISD::SETCC_CARRY)
25895 return Op.getScalarValueSizeInBits();
25897 if (Op.getOpcode() == X86ISD::VSEXT) {
25898 EVT VT = Op.getValueType();
25899 EVT SrcVT = Op.getOperand(0).getValueType();
25900 unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
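    // Sign-extension adds one known sign bit per widened bit; e.g. a VSEXT
    // from v4i16 to v4i32 has at least 1 + (32 - 16) = 17 sign bits per lane
    // even when the source only has its own sign bit.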
25901 Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
/// Returns true (and the GlobalValue and the offset) if the node is a
/// GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
                                       const GlobalValue* &GA,
                                       int64_t &Offset) const {
  if (N->getOpcode() == X86ISD::Wrapper) {
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                    const X86Subtarget &Subtarget,
                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
  unsigned NumMaskElts = Mask.size();
  unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
  bool FloatDomain = MaskVT.isFloatingPoint() ||
                     (!Subtarget.hasAVX2() && MaskVT.is256BitVector());

  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
      isUndefOrEqual(Mask[0], 0) &&
      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
    Shuffle = X86ISD::VZEXT_MOVL;
    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
    return true;
  }
  // Match against a VZEXT instruction.
  // TODO: Add 256/512-bit vector support.
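  // Illustrative example: a v16i8 mask <0,Z,1,Z,2,Z,3,Z,...> (Z = zero)
  // matches a VZEXT from v16i8 to v8i16 with Scale == 2 below.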
  if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
    unsigned MaxScale = 64 / MaskEltSize;
    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
      bool Match = true;
      unsigned NumDstElts = NumMaskElts / Scale;
      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
        Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
      }
      if (Match) {
        SrcVT = MaskVT;
        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
        Shuffle = X86ISD::VZEXT;
        return true;
      }
    }
  }
  // Check if we have SSE3 which will let us use MOVDDUP etc. The
  // instructions are no slower than UNPCKLPD but have the option to
  // fold the input operand into even an unaligned memory load.
  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v2f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
  }

  if (MaskVT.is256BitVector() && FloatDomain) {
    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v4f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
  }

  if (MaskVT.is512BitVector() && FloatDomain) {
    assert(Subtarget.hasAVX512() &&
           "AVX512 required for 512-bit vector shuffles");
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v8f64;
      return true;
    }
    if (isTargetShuffleEquivalent(
            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
    if (isTargetShuffleEquivalent(
            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
  }

  // Attempt to match against broadcast-from-vector.
  if (Subtarget.hasAVX2()) {
    SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
    if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
      SrcVT = DstVT = MaskVT;
      Shuffle = X86ISD::VBROADCAST;
      return true;
    }
  }

  return false;
}
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                           const X86Subtarget &Subtarget,
                                           unsigned &Shuffle, MVT &ShuffleVT,
                                           unsigned &PermuteImm) {
  unsigned NumMaskElts = Mask.size();
  bool FloatDomain = MaskVT.isFloatingPoint();

  bool ContainsZeros = false;
  SmallBitVector Zeroable(NumMaskElts, false);
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    Zeroable[i] = isUndefOrZero(M);
    ContainsZeros |= (M == SM_SentinelZero);
  }

  // Attempt to match against byte/bit shifts.
  // FIXME: Add 512-bit support.
  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
                       (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
                                             MaskVT.getScalarSizeInBits(), Mask,
                                             0, Zeroable, Subtarget);
    if (0 < ShiftAmt) {
      PermuteImm = (unsigned)ShiftAmt;
      return true;
    }
  }

  // Ensure we don't contain any zero elements.
  if (ContainsZeros)
    return false;

  assert(llvm::all_of(Mask, [&](int M) {
           return SM_SentinelUndef <= M && M < (int)NumMaskElts;
         }) && "Expected unary shuffle");
  unsigned InputSizeInBits = MaskVT.getSizeInBits();
  unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

  // Handle PSHUFLW/PSHUFHW repeated patterns.
  if (MaskScalarSizeInBits == 16) {
    SmallVector<int, 4> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
      ArrayRef<int> LoMask(Mask.data() + 0, 4);
      ArrayRef<int> HiMask(Mask.data() + 4, 4);

      // PSHUFLW: permute lower 4 elements only.
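      // Illustrative example: LoMask <1,0,3,2> packs into the v4 shuffle
      // immediate 0b10110001 (0xB1), two bits per destination element.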
      if (isUndefOrInRange(LoMask, 0, 4) &&
          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
        Shuffle = X86ISD::PSHUFLW;
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(LoMask);
        return true;
      }

      // PSHUFHW: permute upper 4 elements only.
      if (isUndefOrInRange(HiMask, 4, 8) &&
          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
        // Offset the HiMask so that we can create the shuffle immediate.
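        // e.g. HiMask <5,4,7,6> becomes <1,0,3,2> before forming the
        // immediate, since PSHUFHW indices are relative to the upper half.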
        int OffsetHiMask[4];
        for (int i = 0; i != 4; ++i)
          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

        Shuffle = X86ISD::PSHUFHW;
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
        return true;
      }
    }
  }

  // We only support permutation of 32/64-bit elements after this.
  if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
    return false;

  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
  if (FloatDomain && !Subtarget.hasAVX())
    return false;

  // Pre-AVX2 we must use float shuffles on 256-bit vectors.
  if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
    FloatDomain = true;
  // Check for lane crossing permutes.
  if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
    // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
    if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
      Shuffle = X86ISD::VPERMI;
      ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
      PermuteImm = getV4X86ShuffleImm(Mask);
      return true;
    }
    if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
      SmallVector<int, 4> RepeatedMask;
      if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
        Shuffle = X86ISD::VPERMI;
        ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
        PermuteImm = getV4X86ShuffleImm(RepeatedMask);
        return true;
      }
    }
    return false;
  }

  // VPERMILPD can permute with a non-repeating shuffle.
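  // One immediate bit per element selects within each 128-bit pair; e.g. a
  // v4f64 mask <1,0,3,2> yields PermuteImm 0b0101.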
  if (FloatDomain && MaskScalarSizeInBits == 64) {
    Shuffle = X86ISD::VPERMILPI;
    ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
    PermuteImm = 0;
    for (int i = 0, e = Mask.size(); i != e; ++i) {
      int M = Mask[i];
      if (M == SM_SentinelUndef)
        continue;
      assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
      PermuteImm |= (M & 1) << i;
    }
    return true;
  }

  // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
  SmallVector<int, 4> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
    return false;

  // Narrow the repeated mask for 32-bit element permutes.
  SmallVector<int, 4> WordMask = RepeatedMask;
  if (MaskScalarSizeInBits == 64)
    scaleShuffleMask(2, RepeatedMask, WordMask);

  Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
  ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
  ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
  PermuteImm = getV4X86ShuffleImm(WordMask);
  return true;
}
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                     SDValue &V1, SDValue &V2,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &ShuffleVT,
                                     bool IsUnary) {
  bool FloatDomain = MaskVT.isFloatingPoint();
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

  if (MaskVT.is128BitVector()) {
    if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
      V2 = V1;
      Shuffle = X86ISD::MOVLHPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
      V2 = V1;
      Shuffle = X86ISD::MOVHLPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
        (FloatDomain || !Subtarget.hasSSE41())) {
      std::swap(V1, V2);
      Shuffle = X86ISD::MOVSD;
      ShuffleVT = MaskVT;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
        (FloatDomain || !Subtarget.hasSSE41())) {
      Shuffle = X86ISD::MOVSS;
      ShuffleVT = MaskVT;
      return true;
    }
  }

  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
      (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
      (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
    MVT LegalVT = MaskVT;
    if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
      LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);

    SmallVector<int, 64> Unpckl, Unpckh;
    if (IsUnary) {
      createUnpackShuffleMask(MaskVT, Unpckl, true, true);
      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
        V2 = V1;
        Shuffle = X86ISD::UNPCKL;
        ShuffleVT = LegalVT;
        return true;
      }

      createUnpackShuffleMask(MaskVT, Unpckh, false, true);
      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
        V2 = V1;
        Shuffle = X86ISD::UNPCKH;
        ShuffleVT = LegalVT;
        return true;
      }
    } else {
      createUnpackShuffleMask(MaskVT, Unpckl, true, false);
      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
        Shuffle = X86ISD::UNPCKL;
        ShuffleVT = LegalVT;
        return true;
      }

      createUnpackShuffleMask(MaskVT, Unpckh, false, false);
      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
        Shuffle = X86ISD::UNPCKH;
        ShuffleVT = LegalVT;
        return true;
      }

      // If a binary shuffle, commute the mask and try again.
      ShuffleVectorSDNode::commuteMask(Unpckl);
      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
        std::swap(V1, V2);
        Shuffle = X86ISD::UNPCKL;
        ShuffleVT = LegalVT;
        return true;
      }

      ShuffleVectorSDNode::commuteMask(Unpckh);
      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
        std::swap(V1, V2);
        Shuffle = X86ISD::UNPCKH;
        ShuffleVT = LegalVT;
        return true;
      }
    }
  }

  return false;
}
static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                            SDValue &V1, SDValue &V2,
                                            SDLoc &DL, SelectionDAG &DAG,
                                            const X86Subtarget &Subtarget,
                                            unsigned &Shuffle, MVT &ShuffleVT,
                                            unsigned &PermuteImm) {
  unsigned NumMaskElts = Mask.size();
  bool FloatDomain = MaskVT.isFloatingPoint();

  // Attempt to match against PALIGNR byte rotate.
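  // Illustrative example: a v16i8 mask <4,5,...,18,19> selecting bytes 4-19
  // of the concatenated inputs is a byte rotation by 4, so PermuteImm = 4.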
  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
                       (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
    int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
    if (0 < ByteRotation) {
      Shuffle = X86ISD::PALIGNR;
      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
      PermuteImm = ByteRotation;
      return true;
    }
  }

  // Attempt to combine to X86ISD::BLENDI.
  if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
                           (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
    // Determine a type compatible with X86ISD::BLENDI.
    // TODO - add 16i16 support (requires lane duplication).
    MVT BlendVT = MaskVT;
    if (Subtarget.hasAVX2()) {
      if (BlendVT == MVT::v4i64)
        BlendVT = MVT::v8i32;
      else if (BlendVT == MVT::v2i64)
        BlendVT = MVT::v4i32;
    } else {
      if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
        BlendVT = MVT::v8i16;
      else if (BlendVT == MVT::v4i64)
        BlendVT = MVT::v4f64;
      else if (BlendVT == MVT::v8i32)
        BlendVT = MVT::v8f32;
    }

    unsigned BlendSize = BlendVT.getVectorNumElements();
    unsigned MaskRatio = BlendSize / NumMaskElts;

    // Can we blend with zero?
    if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
                                         /*Low*/ 0) &&
        NumMaskElts <= BlendVT.getVectorNumElements()) {
      PermuteImm = 0;
      for (unsigned i = 0; i != BlendSize; ++i)
        if (Mask[i / MaskRatio] < 0)
          PermuteImm |= 1u << i;

      V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
      Shuffle = X86ISD::BLENDI;
      ShuffleVT = BlendVT;
      return true;
    }

    // Attempt to match as a binary blend.
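    // e.g. a v4f32 mask <0,5,2,7> takes elements 1 and 3 from V2, so the
    // blend immediate is 0b1010.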
    if (NumMaskElts <= BlendVT.getVectorNumElements()) {
      bool MatchBlend = true;
      for (int i = 0; i != (int)NumMaskElts; ++i) {
        int M = Mask[i];
        if (M == SM_SentinelUndef)
          continue;
        else if (M == SM_SentinelZero)
          MatchBlend = false;
        else if ((M != i) && (M != (i + (int)NumMaskElts)))
          MatchBlend = false;
      }

      if (MatchBlend) {
        PermuteImm = 0;
        for (unsigned i = 0; i != BlendSize; ++i)
          if ((int)NumMaskElts <= Mask[i / MaskRatio])
            PermuteImm |= 1u << i;

        Shuffle = X86ISD::BLENDI;
        ShuffleVT = BlendVT;
        return true;
      }
    }
  }

  // Attempt to combine to INSERTPS.
  if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
    SmallBitVector Zeroable(4, false);
    for (unsigned i = 0; i != NumMaskElts; ++i)
      if (Mask[i] < 0)
        Zeroable[i] = true;

    if (Zeroable.any() &&
        matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
      Shuffle = X86ISD::INSERTPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
  }

  // Attempt to combine to SHUFPD.
  if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
      (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
      (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
    if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
      Shuffle = X86ISD::SHUFP;
      ShuffleVT = MaskVT;
      return true;
    }
  }

  // Attempt to combine to SHUFPS.
  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
      (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
    SmallVector<int, 4> RepeatedMask;
    if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
      // Match each half of the repeated mask, to determine if it is just
      // referencing one of the vectors, is zeroable or entirely undef.
      auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
        int M0 = RepeatedMask[Offset];
        int M1 = RepeatedMask[Offset + 1];

        if (isUndefInRange(RepeatedMask, Offset, 2)) {
          return DAG.getUNDEF(MaskVT);
        } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : 0);
          S1 = (SM_SentinelUndef == M1 ? -1 : 1);
          return getZeroVector(MaskVT, Subtarget, DAG, DL);
        } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V1;
        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V2;
        }

        return SDValue();
      };

      int ShufMask[4] = {-1, -1, -1, -1};
      SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

      if (Lo && Hi) {
        V1 = Lo;
        V2 = Hi;
        Shuffle = X86ISD::SHUFP;
        ShuffleVT = MaskVT;
        PermuteImm = getV4X86ShuffleImm(ShufMask);
        return true;
      }
    }
  }

  return false;
}
/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                                   ArrayRef<int> BaseMask, int Depth,
                                   bool HasVariableMask, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
  assert((Inputs.size() == 1 || Inputs.size() == 2) &&
         "Unexpected number of shuffle inputs!");
  // Find the inputs that enter the chain. Note that multiple uses are OK
  // here; we're not going to remove the operands we find.
  bool UnaryShuffle = (Inputs.size() == 1);
  SDValue V1 = peekThroughBitcasts(Inputs[0]);
  SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));

  MVT VT1 = V1.getSimpleValueType();
  MVT VT2 = V2.getSimpleValueType();
  MVT RootVT = Root.getSimpleValueType();
  assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
         VT2.getSizeInBits() == RootVT.getSizeInBits() &&
         "Vector size mismatch");

  SDLoc DL(Root);
  SDValue Res;

  unsigned NumBaseMaskElts = BaseMask.size();
  if (NumBaseMaskElts == 1) {
    assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
                  /*AddTo*/ true);
    return true;
  }

  unsigned RootSizeInBits = RootVT.getSizeInBits();
  unsigned NumRootElts = RootVT.getVectorNumElements();
  unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
  bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
                     (RootVT.is256BitVector() && !Subtarget.hasAVX2());

  // Don't combine if we are an AVX512/EVEX target and the mask element size
  // is different from the root element size - this would prevent writemasks
  // from being reused.
  // TODO - this currently prevents all lane shuffles from occurring.
  // TODO - check for writemasks usage instead of always preventing combining.
  // TODO - attempt to narrow Mask back to writemask size.
  bool IsEVEXShuffle =
      RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
  if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
    return false;

  // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.

  // Handle 128-bit lane shuffles of 256-bit vectors.
  // TODO - this should support binary shuffles.
  if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
      !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
    if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
      return false; // Nothing to do!
    MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
    unsigned PermMask = 0;
    PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
    PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);

    Res = DAG.getBitcast(ShuffleVT, V1);
    DCI.AddToWorklist(Res.getNode());
    Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
                      DAG.getUNDEF(ShuffleVT),
                      DAG.getConstant(PermMask, DL, MVT::i8));
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }

  // For masks that have been widened to 128-bit elements or more,
  // narrow back down to 64-bit elements.
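  // e.g. a widened 2-element mask <1,0> over a 256-bit root becomes the
  // 4 x 64-bit mask <2,3,0,1>.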
  SmallVector<int, 64> Mask;
  if (BaseMaskEltSizeInBits > 64) {
    assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
    int MaskScale = BaseMaskEltSizeInBits / 64;
    scaleShuffleMask(MaskScale, BaseMask, Mask);
  } else {
    Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
  }

  unsigned NumMaskElts = Mask.size();
  unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;

  // Determine the effective mask value type.
  FloatDomain &= (32 <= MaskEltSizeInBits);
  MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
                           : MVT::getIntegerVT(MaskEltSizeInBits);
  MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);

  // Only allow legal mask types.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
    return false;

  // Attempt to match the mask against known shuffle patterns.
  MVT ShuffleSrcVT, ShuffleVT;
  unsigned Shuffle, PermuteImm;
  if (UnaryShuffle) {
    // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
    // directly if we don't shuffle the lower element and we shuffle the upper
    // (zero) elements within themselves.
    if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
        (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
      unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
      ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
      if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
          isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
        DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
                      /*AddTo*/ true);
        return true;
      }
    }

    if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleSrcVT,
                                ShuffleVT)) {
      if (Depth == 1 && Root.getOpcode() == Shuffle)
        return false; // Nothing to do!
      if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
        return false; // AVX512 Writemask clash.
      Res = DAG.getBitcast(ShuffleSrcVT, V1);
      DCI.AddToWorklist(Res.getNode());
      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
      DCI.AddToWorklist(Res.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                    /*AddTo*/ true);
      return true;
    }

    if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Subtarget, Shuffle,
                                       ShuffleVT, PermuteImm)) {
      if (Depth == 1 && Root.getOpcode() == Shuffle)
        return false; // Nothing to do!
      if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
        return false; // AVX512 Writemask clash.
      Res = DAG.getBitcast(ShuffleVT, V1);
      DCI.AddToWorklist(Res.getNode());
      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
                        DAG.getConstant(PermuteImm, DL, MVT::i8));
      DCI.AddToWorklist(Res.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                    /*AddTo*/ true);
      return true;
    }
  }

  if (matchBinaryVectorShuffle(MaskVT, Mask, V1, V2, Subtarget, Shuffle,
                               ShuffleVT, UnaryShuffle)) {
    if (Depth == 1 && Root.getOpcode() == Shuffle)
      return false; // Nothing to do!
    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
      return false; // AVX512 Writemask clash.
    V1 = DAG.getBitcast(ShuffleVT, V1);
    DCI.AddToWorklist(V1.getNode());
    V2 = DAG.getBitcast(ShuffleVT, V2);
    DCI.AddToWorklist(V2.getNode());
    Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }

  if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, V1, V2, DL, DAG, Subtarget,
                                      Shuffle, ShuffleVT, PermuteImm)) {
    if (Depth == 1 && Root.getOpcode() == Shuffle)
      return false; // Nothing to do!
    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
      return false; // AVX512 Writemask clash.
    V1 = DAG.getBitcast(ShuffleVT, V1);
    DCI.AddToWorklist(V1.getNode());
    V2 = DAG.getBitcast(ShuffleVT, V2);
    DCI.AddToWorklist(V2.getNode());
    Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
                      DAG.getConstant(PermuteImm, DL, MVT::i8));
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
  // Don't try to re-form single instruction chains under any circumstances now
  // that we've done encoding canonicalization for them.
  if (Depth < 2)
    return false;

  bool MaskContainsZeros =
      any_of(Mask, [](int M) { return M == SM_SentinelZero; });

  if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
    // If we have a single input lane-crossing shuffle then lower to VPERMV.
    if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
        ((Subtarget.hasAVX2() &&
          (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
         (Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
      MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
      MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
      SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
      DCI.AddToWorklist(VPermMask.getNode());
      Res = DAG.getBitcast(MaskVT, V1);
      DCI.AddToWorklist(Res.getNode());
      Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
      DCI.AddToWorklist(Res.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                    /*AddTo*/ true);
      return true;
    }

    // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
    // vector as the second source.
    if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
        ((Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
         (Subtarget.hasVLX() &&
          (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
           MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
      // Adjust shuffle mask - replace SM_SentinelZero with second source index.
      for (unsigned i = 0; i != NumMaskElts; ++i)
        if (Mask[i] == SM_SentinelZero)
          Mask[i] = NumMaskElts + i;

      MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
      MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
      SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
      DCI.AddToWorklist(VPermMask.getNode());
      Res = DAG.getBitcast(MaskVT, V1);
      DCI.AddToWorklist(Res.getNode());
      SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
      DCI.AddToWorklist(Zero.getNode());
      Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
      DCI.AddToWorklist(Res.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                    /*AddTo*/ true);
      return true;
    }

    // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
    if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
        ((Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
         (Subtarget.hasVLX() &&
          (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
           MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
      MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
      MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
      SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
      DCI.AddToWorklist(VPermMask.getNode());
      V1 = DAG.getBitcast(MaskVT, V1);
      DCI.AddToWorklist(V1.getNode());
      V2 = DAG.getBitcast(MaskVT, V2);
      DCI.AddToWorklist(V2.getNode());
      Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
      DCI.AddToWorklist(Res.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                    /*AddTo*/ true);
      return true;
    }
    return false;
  }
  // See if we can combine a single input shuffle with zeros to a bit-mask,
  // which is much simpler than any shuffle.
  if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
      isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
      DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
    APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
    APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
    SmallBitVector UndefElts(NumMaskElts, false);
    SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
    for (unsigned i = 0; i != NumMaskElts; ++i) {
      int M = Mask[i];
      if (M == SM_SentinelUndef) {
        UndefElts[i] = true;
        continue;
      }
      if (M == SM_SentinelZero)
        continue;
      EltBits[i] = AllOnes;
    }
    SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
    DCI.AddToWorklist(BitMask.getNode());
    Res = DAG.getBitcast(MaskVT, V1);
    DCI.AddToWorklist(Res.getNode());
    unsigned AndOpcode =
        FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
    Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }

  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes, use the variable mask to lower to VPERMILPS.
  // TODO Combine other mask types at higher depths.
  if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
      ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
       (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
    SmallVector<SDValue, 16> VPermIdx;
    for (int M : Mask) {
      SDValue Idx =
          M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
      VPermIdx.push_back(Idx);
    }
    MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
    SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
    DCI.AddToWorklist(VPermMask.getNode());
    Res = DAG.getBitcast(MaskVT, V1);
    DCI.AddToWorklist(Res.getNode());
    Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
  // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
  // to VPERMIL2PD/VPERMIL2PS.
  if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
      (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
       MaskVT == MVT::v8f32)) {
    // VPERMIL2 Operation.
    // Bits[3] - Match Bit.
    // Bits[2:1] - (Per Lane) PD Shuffle Mask.
    // Bits[2:0] - (Per Lane) PS Shuffle Mask.
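    // e.g. a zeroed result element is encoded below as selector 8 (match bit
    // set) together with M2ZImm = 2, which tells VPERMIL2 to zero the lanes
    // whose match bit is set.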
    unsigned NumLanes = MaskVT.getSizeInBits() / 128;
    unsigned NumEltsPerLane = NumMaskElts / NumLanes;
    SmallVector<int, 8> VPerm2Idx;
    MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
    MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
    unsigned M2ZImm = 0;
    for (int M : Mask) {
      if (M == SM_SentinelUndef) {
        VPerm2Idx.push_back(-1);
        continue;
      }
      if (M == SM_SentinelZero) {
        M2ZImm = 2;
        VPerm2Idx.push_back(8);
        continue;
      }
      int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
      Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
      VPerm2Idx.push_back(Index);
    }
    V1 = DAG.getBitcast(MaskVT, V1);
    DCI.AddToWorklist(V1.getNode());
    V2 = DAG.getBitcast(MaskVT, V2);
    DCI.AddToWorklist(V2.getNode());
    SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
    DCI.AddToWorklist(VPerm2MaskOp.getNode());
    Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
                      DAG.getConstant(M2ZImm, DL, MVT::i8));
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
  // If we have 3 or more shuffle instructions or a chain involving a variable
  // mask, we can replace them with a single PSHUFB instruction profitably.
  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
  // instructions, but in practice PSHUFB tends to be *very* fast so we're
  // more aggressive.
  if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
      ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
       (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
       (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
    SmallVector<SDValue, 16> PSHUFBMask;
    int NumBytes = RootVT.getSizeInBits() / 8;
    int Ratio = NumBytes / NumMaskElts;
    for (int i = 0; i < NumBytes; ++i) {
      int M = Mask[i / Ratio];
      if (M == SM_SentinelUndef) {
        PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
      if (M == SM_SentinelZero) {
        PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
        continue;
      }
      M = Ratio * M + i % Ratio;
      assert((M / 16) == (i / 16) && "Lane crossing detected");
      PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
    Res = DAG.getBitcast(ByteVT, V1);
    DCI.AddToWorklist(Res.getNode());
    SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
    DCI.AddToWorklist(PSHUFBMaskOp.getNode());
    Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
  // With XOP, if we have a 128-bit binary input shuffle we can always combine
  // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
  // slower than PSHUFB on targets that support both.
  if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
      Subtarget.hasXOP()) {
    // VPPERM Mask Operation
    // Bits[4:0] - Byte Index (0 - 31)
    // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
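    // e.g. the constant 128 (0x80) below has Bits[7:5] == 4, so the
    // corresponding destination byte is zeroed regardless of the index.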
    SmallVector<SDValue, 16> VPPERMMask;
    int NumBytes = RootVT.getSizeInBits() / 8;
    int Ratio = NumBytes / NumMaskElts;
    for (int i = 0; i < NumBytes; ++i) {
      int M = Mask[i / Ratio];
      if (M == SM_SentinelUndef) {
        VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
      if (M == SM_SentinelZero) {
        VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
        continue;
      }
      M = Ratio * M + i % Ratio;
      VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
    MVT ByteVT = MVT::v16i8;
    V1 = DAG.getBitcast(ByteVT, V1);
    DCI.AddToWorklist(V1.getNode());
    V2 = DAG.getBitcast(ByteVT, V2);
    DCI.AddToWorklist(V2.getNode());
    SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
    DCI.AddToWorklist(VPPERMMaskOp.getNode());
    Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }

  // Failed to find any combines.
  return false;
}
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
                                        ArrayRef<int> Mask, SDValue Root,
                                        bool HasVariableMask, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Root.getSimpleValueType();

  unsigned SizeInBits = VT.getSizeInBits();
  unsigned NumMaskElts = Mask.size();
  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
  unsigned NumOps = Ops.size();

  // Extract constant bits from each source op.
  bool OneUseConstantOp = false;
  SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
  SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue SrcOp = Ops[i];
    OneUseConstantOp |= SrcOp.hasOneUse();
    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
                                       RawBitsOps[i]))
      return false;
  }

  // Only fold if at least one of the constants is only used once or
  // the combined shuffle has included a variable mask shuffle; this
  // is to avoid constant pool bloat.
  if (!OneUseConstantOp && !HasVariableMask)
    return false;

  // Shuffle the constant bits according to the mask.
  SmallBitVector UndefElts(NumMaskElts, false);
  SmallBitVector ZeroElts(NumMaskElts, false);
  SmallBitVector ConstantElts(NumMaskElts, false);
  SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
                                        APInt::getNullValue(MaskSizeInBits));
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef) {
      UndefElts[i] = true;
      continue;
    } else if (M == SM_SentinelZero) {
      ZeroElts[i] = true;
      continue;
    }
    assert(0 <= M && M < (int)(NumMaskElts * NumOps));

    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;

    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
    if (SrcUndefElts[SrcMaskIdx]) {
      UndefElts[i] = true;
      continue;
    }

    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
    APInt &Bits = SrcEltBits[SrcMaskIdx];
    if (!Bits) {
      ZeroElts[i] = true;
      continue;
    }

    ConstantElts[i] = true;
    ConstantBitData[i] = Bits;
  }
  assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);

  // Create the constant data.
  MVT MaskSVT;
  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
  else
    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

  SDLoc DL(Root);
  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
  DCI.AddToWorklist(CstOp.getNode());
  DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
  return true;
}
/// \brief Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
///    equivalent. In most cases, this is just an encoding size win, but
///    sometimes we will collapse multiple generic shuffles into a single
///    special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
///    instructions, and replace them with the slightly more expensive SSSE3
///    PSHUFB instruction if available. We do this as the last combining step
///    to ensure we avoid using PSHUFB if we can implement the shuffle with
///    a suitable short sequence of other instructions. The PSHUFB will either
///    use a register or have to read from memory and so is slightly (but only
///    slightly) more expensive than the other shuffle instructions.
///
/// Because this is inherently a quadratic operation (for each shuffle in
/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
/// This should never be an issue in practice as the shuffle lowering doesn't
/// produce sequences of more than 8 instructions.
///
/// FIXME: We will currently miss some cases where the redundant shuffling
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
                                          int SrcOpIndex, SDValue Root,
                                          ArrayRef<int> RootMask,
                                          int Depth, bool HasVariableMask,
                                          SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  // Bound the depth of our recursive combine because this is ultimately
  // quadratic in nature.
  if (Depth > 8)
    return false;
  // Directly rip through bitcasts to find the underlying operand.
  SDValue Op = SrcOps[SrcOpIndex];
  Op = peekThroughOneUseBitcasts(Op);

  MVT VT = Op.getSimpleValueType();
  if (!VT.isVector())
    return false; // Bail if we hit a non-vector.

  assert(Root.getSimpleValueType().isVector() &&
         "Shuffles operate on vector types!");
  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
         "Can only combine shuffles of the same vector register size.");

  // Extract target shuffle mask and resolve sentinels and inputs.
  SDValue Input0, Input1;
  SmallVector<int, 16> OpMask;
  if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
    return false;

  // Add the inputs to the Ops list, avoiding duplicates.
  SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());

  int InputIdx0 = -1, InputIdx1 = -1;
  for (int i = 0, e = Ops.size(); i < e; ++i) {
    SDValue BC = peekThroughBitcasts(Ops[i]);
    if (Input0 && BC == peekThroughBitcasts(Input0))
      InputIdx0 = i;
    if (Input1 && BC == peekThroughBitcasts(Input1))
      InputIdx1 = i;
  }

  if (Input0 && InputIdx0 < 0) {
    InputIdx0 = SrcOpIndex;
    Ops[SrcOpIndex] = Input0;
  }
  if (Input1 && InputIdx1 < 0) {
    InputIdx1 = Ops.size();
    Ops.push_back(Input1);
  }

  assert(((RootMask.size() > OpMask.size() &&
           RootMask.size() % OpMask.size() == 0) ||
          (OpMask.size() > RootMask.size() &&
           OpMask.size() % RootMask.size() == 0) ||
          OpMask.size() == RootMask.size()) &&
         "The smaller number of elements must divide the larger.");
  int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
  int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
  int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
  assert(((RootRatio == 1 && OpRatio == 1) ||
          (RootRatio == 1) != (OpRatio == 1)) &&
         "Must not have a ratio for both incoming and op masks!");

  SmallVector<int, 16> Mask;
  Mask.reserve(MaskWidth);

  // Merge this shuffle operation's mask into our accumulated mask. Note that
  // this shuffle's mask will be the first applied to the input, followed by the
  // root mask to get us all the way to the root value arrangement. The reason
  // for this order is that we are recursing up the operation chain.
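  // e.g. with equal-width masks, element i becomes OpMask[RootMask[i]]:
  // RootMask <3,2,1,0> over OpMask <1,0,3,2> accumulates to <2,3,0,1>.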
  for (int i = 0; i < MaskWidth; ++i) {
    int RootIdx = i / RootRatio;
    if (RootMask[RootIdx] < 0) {
      // This is a zero or undef lane, we're done.
      Mask.push_back(RootMask[RootIdx]);
      continue;
    }

    int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;

    // Just insert the scaled root mask value if it references an input other
    // than the SrcOp we're currently inserting.
    if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
        (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
      Mask.push_back(RootMaskedIdx);
      continue;
    }

    RootMaskedIdx %= MaskWidth;

    int OpIdx = RootMaskedIdx / OpRatio;
    if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef, it doesn't matter which ones we
      // are using.
      Mask.push_back(OpMask[OpIdx]);
      continue;
    }

    // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
    int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
    OpMaskedIdx %= MaskWidth;

    if (OpMask[OpIdx] < (int)OpMask.size()) {
      assert(0 <= InputIdx0 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx0 * MaskWidth;
    } else {
      assert(0 <= InputIdx1 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx1 * MaskWidth;
    }

    Mask.push_back(OpMaskedIdx);
  }

  // Handle the all undef/zero cases early.
  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
    DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
    return true;
  }
  if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
    // TODO - should we handle the mixed zero/undef case as well? Just returning
    // a zero mask will lose information on undef elements, possibly reducing
    // future combine possibilities.
    DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
                                                Subtarget, DAG, SDLoc(Root)));
    return true;
  }

  // Remove unused shuffle source ops.
  SmallVector<SDValue, 8> UsedOps;
  for (int i = 0, e = Ops.size(); i < e; ++i) {
    int lo = UsedOps.size() * MaskWidth;
    int hi = lo + MaskWidth;
    if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      UsedOps.push_back(Ops[i]);
      continue;
    }
    for (int &M : Mask)
      if (lo <= M)
        M -= MaskWidth;
  }
  assert(!UsedOps.empty() && "Shuffle with no inputs detected");
  Ops = UsedOps;

  HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());

  // See if we can recurse into each shuffle source op (if it's a target
  // shuffle).
  for (int i = 0, e = Ops.size(); i < e; ++i)
    if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
                                        HasVariableMask, DAG, DCI, Subtarget))
        return true;

  // Attempt to constant fold all of the constant source ops.
  if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
                                  Subtarget))
    return true;

  // We can only combine unary and binary shuffle mask cases.
  if (Ops.size() > 2)
    return false;

  // Minor canonicalization of the accumulated shuffle mask to make it easier
  // to match below. All this does is detect masks with sequential pairs of
  // elements, and shrink them to the half-width mask. It does this in a loop
  // so it will reduce the size of the mask to the minimal width mask which
  // performs an equivalent shuffle.
  SmallVector<int, 16> WidenedMask;
  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
    Mask = std::move(WidenedMask);
    WidenedMask.clear();
  }

  // Canonicalization of binary shuffle masks to improve pattern matching by
  // commuting the inputs.
  if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
    ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(Ops[0], Ops[1]);
  }

  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
                                DCI, Subtarget);
}
/// \brief Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  SmallVector<SDValue, 2> Ops;
  bool IsUnary;
  bool HaveMask =
      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
  (void)HaveMask;
  assert(HaveMask);

  // If we have more than 128-bits, only the low 128-bits of shuffle mask
  // matter. Check that the upper masks are repeats and remove them.
  if (VT.getSizeInBits() > 128) {
    int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
      for (int j = 0; j < LaneElts; ++j)
        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
               "Mask doesn't repeat in high 128-bit lanes!");
#endif
    Mask.resize(LaneElts);
  }

  switch (N.getOpcode()) {
  case X86ISD::PSHUFD:
    return Mask;
  case X86ISD::PSHUFLW:
    Mask.resize(4);
    return Mask;
  case X86ISD::PSHUFHW:
    Mask.erase(Mask.begin(), Mask.begin() + 4);
    for (int &M : Mask)
      M -= 4;
    return Mask;
  default:
    llvm_unreachable("No valid shuffle instruction found!");
  }
}
/// \brief Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
                             SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  assert(N.getOpcode() == X86ISD::PSHUFD &&
         "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);

  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
  SmallVector<SDValue, 8> Chain;
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return SDValue(); // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFD:
      // Found another dword shuffle.
      break;

    case X86ISD::PSHUFLW:
      // Check that the low words (being shuffled) are the identity in the
      // dword shuffle, and the high words are self-contained.
      if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::PSHUFHW:
      // Check that the high words (being shuffled) are the identity in the
      // dword shuffle, and the low words are self-contained.
      if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
          V.getSimpleValueType().getVectorElementType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
      unsigned CombineOp =
          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
      if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
      Chain.push_back(V);
      V = V.getOperand(0);
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          if (V.getOpcode() == CombineOp)
            break;

          Chain.push_back(V);

          LLVM_FALLTHROUGH;
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        break;
      } while (V.hasOneUse());
      break;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();

  // Merge this node's mask and our incoming mask.
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Rebuild the chain around this new shuffle.
  while (!Chain.empty()) {
    SDValue W = Chain.pop_back_val();

    if (V.getValueType() != W.getOperand(0).getValueType())
      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

    switch (W.getOpcode()) {
    default:
      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;

    case X86ISD::PSHUFD:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
  if (V.getValueType() != N.getValueType())
    V = DAG.getBitcast(N.getValueType(), V);

  // Return the new chain to replace N.
  return V;
}
/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
/// through shuffles which switch halves trying to find a shuffle of the same
/// pair of dwords.
static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
                                        SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  assert(
      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
      "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);
  unsigned CombineOpcode = N.getOpcode();

  // Walk up a single-use chain looking for a combinable shuffle.
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return false; // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      if (V.getOpcode() == CombineOpcode)
        break;

      // Other-half shuffles are no-ops.
      continue;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return false;

  // Combine away the bottom node as its shuffle will be accumulated into
  // a preceding shuffle.
  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

  // Record the old value.
  SDValue Old = V;

  // Merge this node's mask and our incoming mask (adjusted to account for all
  // the pshufd instructions encountered).
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Check that the shuffles didn't cancel each other out. If not, we need to
  // combine to the new one.
  if (Old != V)
    // Replace the combinable shuffle with the combined one, updating all users
    // so that we re-evaluate the chain here.
    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);

  return true;
}
/// \brief Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4);
    break;
  case X86ISD::UNPCKL: {
    auto Op0 = N.getOperand(0);
    auto Op1 = N.getOperand(1);
    unsigned Opcode0 = Op0.getOpcode();
    unsigned Opcode1 = Op1.getOpcode();

    // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
    // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
    // TODO: Add other horizontal operations as required.
    if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
      return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));

    // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
    // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
    // moves upper half elements into the lower half part. For example:
    //
    // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
    //             undef:v16i8
    // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
    //
    // will be combined to:
    //
    // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
    //
    // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
    // happen due to advanced instructions.
    if (!VT.is128BitVector())
      break;

    if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
      ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();

      unsigned NumElts = VT.getVectorNumElements();
      SmallVector<int, 8> ExpectedMask(NumElts, -1);
      std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
                NumElts / 2);

      auto ShufOp = Op1.getOperand(0);
      if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
        return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
    }
    return SDValue();
  }
  case X86ISD::BLENDI: {
    SDValue V0 = N->getOperand(0);
    SDValue V1 = N->getOperand(1);
    assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
           "Unexpected input vector types");

    // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
    // operands and changing the mask to 1. This saves us a bunch of
    // pattern-matching possibilities related to scalar math ops in SSE/AVX.
    // x86InstrInfo knows how to commute this back after instruction selection
    // if it would help register allocation.
    //
    // TODO: If optimizing for size or a processor that doesn't suffer from
    // partial register update stalls, this should be transformed into a MOVSD
    // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
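    // e.g. BLENDI(V0, V1, 2), which takes element 1 from V1, becomes
    // BLENDI(V1, V0, 1) - the same result with the operands swapped.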
27532 if (VT == MVT::v2f64)
27533 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
27534 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
27535 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
27536 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
27537 }
27539 return SDValue();
27540 }
27541 case X86ISD::MOVSD:
27542 case X86ISD::MOVSS: {
27543 bool isFloat = VT.isFloatingPoint();
27544 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
27545 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
27546 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
27547 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
27548 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
27549 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
27550 assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
27552 // We often lower to MOVSD/MOVSS from integer as well as native float
27553 // types; remove unnecessary domain-crossing bitcasts if we can to make it
27554 // easier to combine shuffles later on. We've already accounted for the
27555 // domain switching cost when we decided to lower with it.
27556 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
27557 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
27558 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
27559 V0 = DAG.getBitcast(NewVT, V0);
27560 V1 = DAG.getBitcast(NewVT, V1);
27561 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
27562 }
27564 return SDValue();
27565 }
27566 case X86ISD::INSERTPS: {
27567 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
27568 SDValue Op0 = N.getOperand(0);
27569 SDValue Op1 = N.getOperand(1);
27570 SDValue Op2 = N.getOperand(2);
27571 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
27572 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
27573 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
27574 unsigned ZeroMask = InsertPSMask & 0xF;
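// The INSERTPS immediate decodes as in the SSE4.1 instruction: bits [7:6]
// pick the source element of Op1 (count_s), bits [5:4] pick the destination
// slot in Op0 (count_d), and bits [3:0] zero result elements after the
// insertion. E.g. imm = 0x18 inserts Op1[0] into element 1 and zeroes
// element 3.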
27576 // If we zero out all elements from Op0 then we don't need to reference it.
27577 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
27578 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
27579 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27581 // If we zero out the element from Op1 then we don't need to reference it.
27582 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
27583 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
27584 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27586 // Attempt to merge insertps Op1 with an inner target shuffle node.
27587 SmallVector<int, 8> TargetMask1;
27588 SmallVector<SDValue, 2> Ops1;
27589 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
27590 int M = TargetMask1[SrcIdx];
27591 if (isUndefOrZero(M)) {
27592 // Zero/UNDEF insertion - zero out element and remove dependency.
27593 InsertPSMask |= (1u << DstIdx);
27594 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
27595 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27596 }
27597 // Update insertps mask srcidx and reference the source input directly.
27598 assert(0 <= M && M < 8 && "Shuffle index out of range");
27599 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
27600 Op1 = Ops1[M < 4 ? 0 : 1];
27601 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
27602 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27603 }
27605 // Attempt to merge insertps Op0 with an inner target shuffle node.
27606 SmallVector<int, 8> TargetMask0;
27607 SmallVector<SDValue, 2> Ops0;
27608 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
27609 return SDValue();
27611 bool Updated = false;
27612 bool UseInput00 = false;
27613 bool UseInput01 = false;
27614 for (int i = 0; i != 4; ++i) {
27615 int M = TargetMask0[i];
27616 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
27617 // No change if element is already zero or the inserted element.
27618 continue;
27619 } else if (isUndefOrZero(M)) {
27620 // If the target mask is undef/zero then we must zero the element.
27621 InsertPSMask |= (1u << i);
27622 Updated = true;
27623 continue;
27624 }
27626 // The input vector element must be inline.
27627 if (M != i && M != (i + 4))
27628 return SDValue();
27630 // Determine which inputs of the target shuffle we're using.
27631 UseInput00 |= (0 <= M && M < 4);
27632 UseInput01 |= (4 <= M);
27633 }
27635 // If we're not using both inputs of the target shuffle then use the
27636 // referenced input directly.
27637 if (UseInput00 && !UseInput01) {
27638 Updated = true;
27639 Op0 = Ops0[0];
27640 } else if (!UseInput00 && UseInput01) {
27641 Updated = true;
27642 Op0 = Ops0[1];
27643 }
27645 if (Updated)
27646 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
27647 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27649 return SDValue();
27650 }
27651 default:
27652 return SDValue();
27653 }
27655 // Nuke no-op shuffles that show up after combining.
27656 if (isNoopShuffleMask(Mask))
27657 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
27659 // Look for simplifications involving one or two shuffle instructions.
27660 SDValue V = N.getOperand(0);
27661 switch (N.getOpcode()) {
27662 default:
27663 break;
27664 case X86ISD::PSHUFLW:
27665 case X86ISD::PSHUFHW:
27666 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
27668 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
27669 return SDValue(); // We combined away this shuffle, so we're done.
27671 // See if this reduces to a PSHUFD which is no more expensive and can
27672 // combine with more operations. Note that it has to at least flip the
27673 // dwords as otherwise it would have been removed as a no-op.
27674 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
27675 int DMask[] = {0, 1, 2, 3};
27676 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
27677 DMask[DOffset + 0] = DOffset + 1;
27678 DMask[DOffset + 1] = DOffset + 0;
27679 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
27680 V = DAG.getBitcast(DVT, V);
27681 DCI.AddToWorklist(V.getNode());
27682 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
27683 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
27684 DCI.AddToWorklist(V.getNode());
27685 return DAG.getBitcast(VT, V);
27686 }
27688 // Look for shuffle patterns which can be implemented as a single unpack.
27689 // FIXME: This doesn't handle the location of the PSHUFD generically, and
27690 // only works when we have a PSHUFD followed by two half-shuffles.
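// For example (a sketch): for a PSHUFD feeding a PSHUFLW feeding a PSHUFHW,
// mapping the word mask through the dword mask can produce
// {0,0,1,1,2,2,3,3}, which is exactly UNPCKL(In, In) of the PSHUFD's input;
// {4,4,5,5,6,6,7,7} likewise selects UNPCKH(In, In).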
27691 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
27692 (V.getOpcode() == X86ISD::PSHUFLW ||
27693 V.getOpcode() == X86ISD::PSHUFHW) &&
27694 V.getOpcode() != N.getOpcode() &&
27695 V.hasOneUse()) {
27696 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
27697 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
27698 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
27699 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
27700 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
27701 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
27702 int WordMask[8];
27703 for (int i = 0; i < 4; ++i) {
27704 WordMask[i + NOffset] = Mask[i] + NOffset;
27705 WordMask[i + VOffset] = VMask[i] + VOffset;
27706 }
27707 // Map the word mask through the DWord mask.
27708 int MappedMask[8];
27709 for (int i = 0; i < 8; ++i)
27710 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
27711 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
27712 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
27713 // We can replace all three shuffles with an unpack.
27714 V = DAG.getBitcast(VT, D.getOperand(0));
27715 DCI.AddToWorklist(V.getNode());
27716 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
27717 : X86ISD::UNPCKH,
27718 DL, VT, V, V);
27719 }
27720 }
27721 }
27722 break;
27725 case X86ISD::PSHUFD:
27726 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
27727 return NewN;
27729 break;
27730 }
27732 return SDValue();
27733 }
27735 /// \brief Try to combine a shuffle into a target-specific add-sub node.
27737 /// We combine this directly on the abstract vector shuffle nodes so it is
27738 /// easier to generically match. We also insert dummy vector shuffle nodes for
27739 /// the operands which explicitly discard the lanes which are unused by this
27740 /// operation to try to flow through the rest of the combiner the fact that
27741 /// they're unused.
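// For example (a sketch of the v4f32 case):
//   t1: v4f32 = fsub A, B
//   t2: v4f32 = fadd A, B
//   t3: v4f32 = vector_shuffle<0,5,2,7> t1, t2
// becomes
//   t3: v4f32 = X86ISD::ADDSUB A, B
// which matches ADDSUBPS: subtract in the even lanes, add in the odd lanes.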
27742 static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
27743 SelectionDAG &DAG) {
27744 SDLoc DL(N);
27745 EVT VT = N->getValueType(0);
27746 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
27747 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
27748 return SDValue();
27750 // We only handle target-independent shuffles.
27751 // FIXME: It would be easy and harmless to use the target shuffle mask
27752 // extraction tool to support more.
27753 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
27754 return SDValue();
27756 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
27757 SmallVector<int, 8> Mask(OrigMask.begin(), OrigMask.end());
27759 SDValue V1 = N->getOperand(0);
27760 SDValue V2 = N->getOperand(1);
27762 // We require the first shuffle operand to be the FSUB node, and the second to
27763 // be the FADD node.
27764 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
27765 ShuffleVectorSDNode::commuteMask(Mask);
27766 std::swap(V1, V2);
27767 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
27768 return SDValue();
27770 // If there are other uses of these operations we can't fold them.
27771 if (!V1->hasOneUse() || !V2->hasOneUse())
27772 return SDValue();
27774 // Ensure that both operations have the same operands. Note that we can
27775 // commute the FADD operands.
27776 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
27777 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
27778 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
27779 return SDValue();
27781 // We're looking for blends between FADD and FSUB nodes. We insist on these
27782 // nodes being lined up in a specific expected pattern.
27783 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
27784 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
27785 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
27786 return SDValue();
27788 return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
27789 }
27791 // We are looking for a shuffle where both sources are concatenated with undef
27792 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
27793 // if we can express this as a single-source shuffle, that's preferable.
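// For example (a sketch with v4i32 halves t1 and t2):
//   t3: v8i32 = vector_shuffle<0,8,1,9,2,10,3,11> (concat t1, undef),
//                                                 (concat t2, undef)
// becomes
//   t3: v8i32 = vector_shuffle<0,4,1,5,2,6,3,7> (concat t1, t2), undef
// since t2 now occupies the upper half of the single source (an index
// Elt >= 8 becomes Elt - 4).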
27794 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
27795 const X86Subtarget &Subtarget) {
27796 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
27797 return SDValue();
27799 EVT VT = N->getValueType(0);
27801 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
27802 if (!VT.is128BitVector() && !VT.is256BitVector())
27803 return SDValue();
27805 if (VT.getVectorElementType() != MVT::i32 &&
27806 VT.getVectorElementType() != MVT::i64 &&
27807 VT.getVectorElementType() != MVT::f32 &&
27808 VT.getVectorElementType() != MVT::f64)
27809 return SDValue();
27811 SDValue N0 = N->getOperand(0);
27812 SDValue N1 = N->getOperand(1);
27814 // Check that both sources are concats with undef.
27815 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
27816 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
27817 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
27818 !N1.getOperand(1).isUndef())
27819 return SDValue();
27821 // Construct the new shuffle mask. Elements from the first source retain their
27822 // index, but elements from the second source no longer need to skip an undef.
27823 SmallVector<int, 8> Mask;
27824 int NumElts = VT.getVectorNumElements();
27826 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
27827 for (int Elt : SVOp->getMask())
27828 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
27830 SDLoc DL(N);
27831 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
27832 N1.getOperand(0));
27833 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
27834 }
27836 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
27837 TargetLowering::DAGCombinerInfo &DCI,
27838 const X86Subtarget &Subtarget) {
27839 SDLoc dl(N);
27840 EVT VT = N->getValueType(0);
27842 // Don't create instructions with illegal types after legalize types has run.
27843 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27844 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
27845 return SDValue();
27847 // If we have legalized the vector types, look for blends of FADD and FSUB
27848 // nodes that we can fuse into an ADDSUB node.
27849 if (TLI.isTypeLegal(VT))
27850 if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
27851 return AddSub;
27853 // During Type Legalization, when promoting illegal vector types,
27854 // the backend might introduce new shuffle dag nodes and bitcasts.
27856 // This code performs the following transformation:
27857 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
27858 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
27860 // We do this only if both the bitcast and the BINOP dag nodes have
27861 // one use. Also, perform this transformation only if the new binary
27862 // operation is legal. This is to avoid introducing dag nodes that
27863 // potentially need to be further expanded (or custom lowered) into a
27864 // less optimal sequence of dag nodes.
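// For example (a sketch, assuming both v4i32 and v8i16 adds are legal):
//   t1: v4i32 = add A, B
//   t2: v8i16 = bitcast t1
//   t3: v8i16 = vector_shuffle<0,2,4,6,u,u,u,u> t2, undef
// becomes
//   t4: v8i16 = add (bitcast A), (bitcast B)
//   t3: v8i16 = vector_shuffle<0,2,4,6,u,u,u,u> t4, undef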
27865 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
27866 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
27867 N->getOperand(0).getOpcode() == ISD::BITCAST &&
27868 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
27869 SDValue N0 = N->getOperand(0);
27870 SDValue N1 = N->getOperand(1);
27872 SDValue BC0 = N0.getOperand(0);
27873 EVT SVT = BC0.getValueType();
27874 unsigned Opcode = BC0.getOpcode();
27875 unsigned NumElts = VT.getVectorNumElements();
27877 if (BC0.hasOneUse() && SVT.isVector() &&
27878 SVT.getVectorNumElements() * 2 == NumElts &&
27879 TLI.isOperationLegal(Opcode, VT)) {
27880 bool CanFold = false;
27881 switch (Opcode) {
27882 default: break;
27883 case ISD::ADD:
27884 case ISD::SUB:
27885 case ISD::MUL:
27886 // isOperationLegal lies for integer ops on floating point types.
27887 CanFold = VT.isInteger();
27888 break;
27889 case ISD::FADD:
27890 case ISD::FSUB:
27891 case ISD::FMUL:
27892 // isOperationLegal lies for floating point ops on integer types.
27893 CanFold = VT.isFloatingPoint();
27894 break;
27895 }
27897 unsigned SVTNumElts = SVT.getVectorNumElements();
27898 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
27899 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
27900 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
27901 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
27902 CanFold = SVOp->getMaskElt(i) < 0;
27904 if (CanFold) {
27905 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
27906 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
27907 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
27908 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
27909 }
27910 }
27911 }
27913 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
27914 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
27915 // consecutive, non-overlapping, and in the right order.
27916 SmallVector<SDValue, 16> Elts;
27917 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
27918 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
27920 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
27921 return LD;
27923 // For AVX2, we sometimes want to combine
27924 // (vector_shuffle <mask> (concat_vectors t1, undef)
27925 // (concat_vectors t2, undef))
27926 // into:
27927 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
27928 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
27929 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
27930 return ShufConcat;
27932 if (isTargetShuffle(N->getOpcode())) {
27933 SDValue Op(N, 0);
27934 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
27935 return Shuffle;
27937 // Try recursively combining arbitrary sequences of x86 shuffle
27938 // instructions into higher-order shuffles. We do this after combining
27939 // specific PSHUF instruction sequences into their minimal form so that we
27940 // can evaluate how many specialized shuffle instructions are involved in
27941 // a particular chain.
27942 SmallVector<int, 1> NonceMask; // Just a placeholder.
27943 NonceMask.push_back(0);
27944 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
27945 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
27946 DCI, Subtarget))
27947 return SDValue(); // This routine will use CombineTo to replace N.
27948 }
27950 return SDValue();
27951 }
27953 /// Check if a vector extract from a target-specific shuffle of a load can be
27954 /// folded into a single element load.
27955 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
27956 /// shuffles have been custom lowered so we need to handle those here.
27957 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
27958 TargetLowering::DAGCombinerInfo &DCI) {
27959 if (DCI.isBeforeLegalizeOps())
27960 return SDValue();
27962 SDValue InVec = N->getOperand(0);
27963 SDValue EltNo = N->getOperand(1);
27964 EVT EltVT = N->getValueType(0);
27966 if (!isa<ConstantSDNode>(EltNo))
27967 return SDValue();
27969 EVT OriginalVT = InVec.getValueType();
27971 if (InVec.getOpcode() == ISD::BITCAST) {
27972 // Don't duplicate a load with other uses.
27973 if (!InVec.hasOneUse())
27974 return SDValue();
27975 EVT BCVT = InVec.getOperand(0).getValueType();
27976 if (!BCVT.isVector() ||
27977 BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
27978 return SDValue();
27979 InVec = InVec.getOperand(0);
27980 }
27982 EVT CurrentVT = InVec.getValueType();
27984 if (!isTargetShuffle(InVec.getOpcode()))
27985 return SDValue();
27987 // Don't duplicate a load with other uses.
27988 if (!InVec.hasOneUse())
27989 return SDValue();
27991 SmallVector<int, 16> ShuffleMask;
27992 SmallVector<SDValue, 2> ShuffleOps;
27993 bool UnaryShuffle;
27994 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
27995 ShuffleOps, ShuffleMask, UnaryShuffle))
27996 return SDValue();
27998 // Select the input vector, guarding against out of range extract vector.
27999 unsigned NumElems = CurrentVT.getVectorNumElements();
28000 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
28001 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
28003 if (Idx == SM_SentinelZero)
28004 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
28005 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
28006 if (Idx == SM_SentinelUndef)
28007 return DAG.getUNDEF(EltVT);
28009 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
28010 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
28011 : ShuffleOps[1];
28013 // If inputs to shuffle are the same for both ops, then allow 2 uses
28014 unsigned AllowedUses =
28015 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
28017 if (LdNode.getOpcode() == ISD::BITCAST) {
28018 // Don't duplicate a load with other uses.
28019 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
28020 return SDValue();
28022 AllowedUses = 1; // only allow 1 load use if we have a bitcast
28023 LdNode = LdNode.getOperand(0);
28024 }
28026 if (!ISD::isNormalLoad(LdNode.getNode()))
28027 return SDValue();
28029 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
28031 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
28032 return SDValue();
28034 // If there's a bitcast before the shuffle, check if the load type and
28035 // alignment is valid.
28036 unsigned Align = LN0->getAlignment();
28037 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28038 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
28039 EltVT.getTypeForEVT(*DAG.getContext()));
28041 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
28042 return SDValue();
28044 // All checks match so transform back to vector_shuffle so that DAG combiner
28045 // can finish the job.
28046 SDLoc dl(N);
28048 // Create the shuffle node, taking into account that it's a unary shuffle.
28049 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
28050 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
28051 ShuffleMask);
28052 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
28053 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
28054 EltNo);
28055 }
28057 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
28058 const X86Subtarget &Subtarget) {
28059 SDValue N0 = N->getOperand(0);
28060 EVT VT = N->getValueType(0);
28062 // Detect bitcasts from i32 to x86mmx low word. Since MMX types are
28063 // special and don't usually play with other vector types, it's better to
28064 // handle them early to be sure we emit efficient code by avoiding
28065 // store-load conversions.
28066 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
28067 N0.getValueType() == MVT::v2i32 &&
28068 isNullConstant(N0.getOperand(1))) {
28069 SDValue N00 = N0->getOperand(0);
28070 if (N00.getValueType() == MVT::i32)
28071 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
28072 }
28074 // Convert a bitcasted integer logic operation that has one bitcasted
28075 // floating-point operand into a floating-point logic operation. This may
28076 // create a load of a constant, but that is cheaper than materializing the
28077 // constant in an integer register and transferring it to an SSE register or
28078 // transferring the SSE operand to integer register and back.
28079 unsigned FPOpcode = ISD::DELETED_NODE;
28080 switch (N0.getOpcode()) {
28081 case ISD::AND: FPOpcode = X86ISD::FAND; break;
28082 case ISD::OR: FPOpcode = X86ISD::FOR; break;
28083 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
28084 default: return SDValue();
28085 }
28087 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
28088 (Subtarget.hasSSE2() && VT == MVT::f64)))
28089 return SDValue();
28091 SDValue LogicOp0 = N0.getOperand(0);
28092 SDValue LogicOp1 = N0.getOperand(1);
28093 SDLoc DL0(N);
28095 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
28096 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
28097 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
28098 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
28099 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
28100 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
28101 }
28102 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
28103 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
28104 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
28105 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
28106 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
28107 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
28108 }
28110 return SDValue();
28111 }
28113 // Match a binop + shuffle pyramid that represents a horizontal reduction over
28114 // the elements of a vector.
28115 // Returns the vector that is being reduced on, or SDValue() if a reduction
28116 // was not matched.
28117 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
28118 // The pattern must end in an extract from index 0.
28119 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
28120 !isNullConstant(Extract->getOperand(1)))
28121 return SDValue();
28123 unsigned Stages =
28124 Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
28126 SDValue Op = Extract->getOperand(0);
28127 // At each stage, we're looking for something that looks like:
28128 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
28129 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
28130 // i32 undef, i32 undef, i32 undef, i32 undef>
28131 // %a = binop <8 x i32> %op, %s
28132 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
28133 // we expect something like:
28134 // <4,5,6,7,u,u,u,u>
28135 // <2,3,u,u,u,u,u,u>
28136 // <1,u,u,u,u,u,u,u>
28137 for (unsigned i = 0; i < Stages; ++i) {
28138 if (Op.getOpcode() != BinOp)
28139 return SDValue();
28141 ShuffleVectorSDNode *Shuffle =
28142 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
28143 if (Shuffle) {
28144 Op = Op.getOperand(1);
28145 } else {
28146 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
28147 Op = Op.getOperand(0);
28148 }
28150 // The first operand of the shuffle should be the same as the other operand
28151 // of the binop.
28152 if (!Shuffle || (Shuffle->getOperand(0) != Op))
28153 return SDValue();
28155 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
28156 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
28157 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
28158 return SDValue();
28159 }
28161 return Op;
28162 }
28164 // Given a select, detect the following pattern:
28165 // 1: %2 = zext <N x i8> %0 to <N x i32>
28166 // 2: %3 = zext <N x i8> %1 to <N x i32>
28167 // 3: %4 = sub nsw <N x i32> %2, %3
28168 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
28169 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
28170 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
28171 // This is useful as it is the input into a SAD pattern.
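// In other words, rows 1-6 above compute abs(zext(%0) - zext(%1))
// element-wise: the select keeps the difference where it is positive and
// takes its negation otherwise, which is exactly what PSADBW accumulates.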
28172 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
28173 SDValue &Op1) {
28174 // Check the condition of the select instruction is greater-than.
28175 SDValue SetCC = Select->getOperand(0);
28176 if (SetCC.getOpcode() != ISD::SETCC)
28177 return false;
28178 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
28179 if (CC != ISD::SETGT)
28180 return false;
28182 SDValue SelectOp1 = Select->getOperand(1);
28183 SDValue SelectOp2 = Select->getOperand(2);
28185 // The second operand of the select should be the negation of the first
28186 // operand, which is implemented as 0 - SelectOp1.
28187 if (!(SelectOp2.getOpcode() == ISD::SUB &&
28188 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
28189 SelectOp2.getOperand(1) == SelectOp1))
28190 return false;
28192 // The first operand of SetCC is the first operand of the select, which is the
28193 // difference between the two input vectors.
28194 if (SetCC.getOperand(0) != SelectOp1)
28195 return false;
28197 // The second operand of the comparison can be either -1 or 0.
28198 if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
28199 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
28200 return false;
28202 // The first operand of the select is the difference between the two input
28203 // vectors.
28204 if (SelectOp1.getOpcode() != ISD::SUB)
28205 return false;
28207 Op0 = SelectOp1.getOperand(0);
28208 Op1 = SelectOp1.getOperand(1);
28210 // Check if the operands of the sub are zero-extended from vectors of i8.
28211 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
28212 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
28213 Op1.getOpcode() != ISD::ZERO_EXTEND ||
28214 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
28215 return false;
28217 return true;
28218 }
28220 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
28221 // to these zexts.
28222 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
28223 const SDValue &Zext1, const SDLoc &DL) {
28225 // Find the appropriate width for the PSADBW.
28226 EVT InVT = Zext0.getOperand(0).getValueType();
28227 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
28229 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
28230 // fill in the missing vector elements with 0.
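// For example (a sketch): a v4i8 input becomes the v16i8 PSADBW operand
// concat_vectors(x, 0, 0, 0); the zero lanes contribute nothing to the
// sums of absolute differences.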
28231 unsigned NumConcat = RegSize / InVT.getSizeInBits();
28232 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
28233 Ops[0] = Zext0.getOperand(0);
28234 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
28235 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
28236 Ops[0] = Zext1.getOperand(0);
28237 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
28239 // Actually build the SAD
28240 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
28241 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
28242 }
28244 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
28245 const X86Subtarget &Subtarget) {
28246 // PSADBW is only supported on SSE2 and up.
28247 if (!Subtarget.hasSSE2())
28248 return SDValue();
28250 // Verify the type we're extracting from is appropriate
28251 // TODO: There's nothing special about i32, any integer type above i16 should
28252 // work just as well.
28253 EVT VT = Extract->getOperand(0).getValueType();
28254 if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
28255 return SDValue();
28257 unsigned RegSize = 128;
28258 if (Subtarget.hasBWI())
28259 RegSize = 512;
28260 else if (Subtarget.hasAVX2())
28261 RegSize = 256;
28263 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
28264 // TODO: We should be able to handle larger vectors by splitting them before
28265 // feeding them into several SADs, and then reducing over those.
28266 if (VT.getSizeInBits() / 4 > RegSize)
28267 return SDValue();
28269 // Match shuffle + add pyramid.
28270 SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
28272 // If there was a match, we want Root to be a select that is the root of an
28273 // abs-diff pattern.
28274 if (!Root || (Root.getOpcode() != ISD::VSELECT))
28275 return SDValue();
28277 // Check whether we have an abs-diff pattern feeding into the select.
28278 SDValue Zext0, Zext1;
28279 if (!detectZextAbsDiff(Root, Zext0, Zext1))
28280 return SDValue();
28282 // Create the SAD instruction.
28283 SDLoc DL(Extract);
28284 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
28286 // If the original vector was wider than 8 elements, sum over the results
28287 // in the SAD vector.
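// For example (a sketch): a v32i8 source yields v4i64 partial sums; one
// shuffle+add folds the upper pair into the lower pair, and a second folds
// lane 1 into lane 0 before the final extract.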
28288 unsigned Stages = Log2_32(VT.getVectorNumElements());
28289 MVT SadVT = SAD.getSimpleValueType();
28290 if (Stages > 3) {
28291 unsigned SadElems = SadVT.getVectorNumElements();
28293 for (unsigned i = Stages - 3; i > 0; --i) {
28294 SmallVector<int, 16> Mask(SadElems, -1);
28295 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
28296 Mask[j] = MaskEnd + j;
28298 SDValue Shuffle =
28299 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
28300 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
28301 }
28302 }
28304 // Return the lowest i32.
28305 MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
28306 SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
28307 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
28308 Extract->getOperand(1));
28309 }
28311 /// Detect vector gather/scatter index generation and convert it from being a
28312 /// bunch of shuffles and extracts into a somewhat faster sequence.
28313 /// For i686, the best sequence is apparently storing the value and loading
28314 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
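// For example (a sketch of the 64-bit path below): four extended i32
// extracts from a v4i32 value V become two i64 extracts plus shifts:
//   t0: v2i64 = bitcast V
//   lo: i64 = extract_vector_elt t0, 0
//   e0: i32 = truncate lo
//   e1: i32 = truncate (sra lo, 32)
// and similarly e2/e3 from the high half.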
28315 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
28316 TargetLowering::DAGCombinerInfo &DCI,
28317 const X86Subtarget &Subtarget) {
28318 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
28319 return NewOp;
28321 SDValue InputVector = N->getOperand(0);
28322 SDLoc dl(InputVector);
28323 // Detect mmx to i32 conversion through a v2i32 elt extract.
28324 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
28325 N->getValueType(0) == MVT::i32 &&
28326 InputVector.getValueType() == MVT::v2i32 &&
28327 isa<ConstantSDNode>(N->getOperand(1)) &&
28328 N->getConstantOperandVal(1) == 0) {
28329 SDValue MMXSrc = InputVector.getOperand(0);
28331 // The bitcast source is a direct mmx result.
28332 if (MMXSrc.getValueType() == MVT::x86mmx)
28333 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
28334 }
28336 EVT VT = N->getValueType(0);
28338 if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
28339 InputVector.getOpcode() == ISD::BITCAST &&
28340 isa<ConstantSDNode>(InputVector.getOperand(0))) {
28341 uint64_t ExtractedElt =
28342 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
28343 uint64_t InputValue =
28344 cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
28345 uint64_t Res = (InputValue >> ExtractedElt) & 1;
28346 return DAG.getConstant(Res, dl, MVT::i1);
28347 }
28349 // Check whether this extract is the root of a sum of absolute differences
28350 // pattern. This has to be done here because we really want it to happen
28351 // pre-legalization.
28352 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
28353 return SAD;
28355 // Only operate on vectors of 4 elements, where the alternative shuffling
28356 // gets to be more expensive.
28357 if (InputVector.getValueType() != MVT::v4i32)
28358 return SDValue();
28360 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
28361 // single use which is a sign-extend or zero-extend, and all elements are
28362 // used.
28363 SmallVector<SDNode *, 4> Uses;
28364 unsigned ExtractedElements = 0;
28365 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
28366 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
28367 if (UI.getUse().getResNo() != InputVector.getResNo())
28368 continue;
28370 SDNode *Extract = *UI;
28371 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
28372 return SDValue();
28374 if (Extract->getValueType(0) != MVT::i32)
28375 return SDValue();
28376 if (!Extract->hasOneUse())
28377 return SDValue();
28378 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
28379 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
28380 return SDValue();
28381 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
28382 return SDValue();
28384 // Record which element was extracted.
28385 ExtractedElements |=
28386 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
28388 Uses.push_back(Extract);
28389 }
28391 // If not all the elements were used, this may not be worthwhile.
28392 if (ExtractedElements != 15)
28393 return SDValue();
28395 // Ok, we've now decided to do the transformation.
28396 // If 64-bit shifts are legal, use the extract-shift sequence,
28397 // otherwise bounce the vector off the cache.
28398 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28399 SDValue Vals[4];
28401 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
28402 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
28403 auto &DL = DAG.getDataLayout();
28404 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
28405 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
28406 DAG.getConstant(0, dl, VecIdxTy));
28407 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
28408 DAG.getConstant(1, dl, VecIdxTy));
28410 SDValue ShAmt = DAG.getConstant(
28411 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
28412 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
28413 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
28414 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
28415 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
28416 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
28417 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
28418 } else {
28419 // Store the value to a temporary stack slot.
28420 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
28421 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
28422 MachinePointerInfo());
28424 EVT ElementType = InputVector.getValueType().getVectorElementType();
28425 unsigned EltSize = ElementType.getSizeInBits() / 8;
28427 // Replace each use (extract) with a load of the appropriate element.
28428 for (unsigned i = 0; i < 4; ++i) {
28429 uint64_t Offset = EltSize * i;
28430 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28431 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
28433 SDValue ScalarAddr =
28434 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
28436 // Load the scalar.
28437 Vals[i] =
28438 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
28439 }
28440 }
28442 // Replace the extracts
28443 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
28444 UE = Uses.end(); UI != UE; ++UI) {
28445 SDNode *Extract = *UI;
28447 SDValue Idx = Extract->getOperand(1);
28448 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
28449 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
28450 }
28452 // The replacement was made in place; don't return anything.
28453 return SDValue();
28454 }
28456 /// If a vector select has an operand that is -1 or 0, simplify the select to a
28457 /// bitwise logic operation.
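// For example (a sketch, with a condition C whose lanes are all-ones or
// all-zeros and match the element width):
//   vselect C, X, 0  -> bitcast (and C, (bitcast X))
//   vselect C, -1, X -> bitcast (or  C, (bitcast X))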
28458 static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
28459 const X86Subtarget &Subtarget) {
28460 SDValue Cond = N->getOperand(0);
28461 SDValue LHS = N->getOperand(1);
28462 SDValue RHS = N->getOperand(2);
28463 EVT VT = LHS.getValueType();
28464 EVT CondVT = Cond.getValueType();
28465 SDLoc DL(N);
28466 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28468 if (N->getOpcode() != ISD::VSELECT)
28469 return SDValue();
28471 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
28472 // Check if the first operand is all zeros. This situation only
28473 // applies to AVX512.
28474 if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) {
28475 // Invert the cond to not(cond): xor(op, allones) = not(op).
28476 SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28477 DAG.getConstant(1, DL, Cond.getValueType()));
28478 // vselect(cond, op1, op2) = vselect(not(cond), op2, op1)
28479 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
28480 }
28481 assert(CondVT.isVector() && "Vector select expects a vector selector!");
28483 // To use the condition operand as a bitwise mask, it must have elements that
28484 // are the same size as the select elements, i.e. the condition operand must
28485 // have already been promoted from the IR select condition type <N x i1>.
28486 // Don't check if the types themselves are equal because that excludes
28487 // vector floating-point selects.
28488 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
28489 return SDValue();
28491 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
28492 FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
28494 // Try to invert the condition if true value is not all 1s and false value is
28495 // not all 0s.
28496 if (!TValIsAllOnes && !FValIsAllZeros &&
28497 // Check if the selector will be produced by CMPP*/PCMP*.
28498 Cond.getOpcode() == ISD::SETCC &&
28499 // Check if SETCC has already been promoted.
28500 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
28501 CondVT) {
28502 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
28503 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
28505 if (TValIsAllZeros || FValIsAllOnes) {
28506 SDValue CC = Cond.getOperand(2);
28507 ISD::CondCode NewCC =
28508 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
28509 Cond.getOperand(0).getValueType().isInteger());
28510 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
28511 NewCC);
28512 std::swap(LHS, RHS);
28513 TValIsAllOnes = FValIsAllOnes;
28514 FValIsAllZeros = TValIsAllZeros;
28515 }
28516 }
28518 if (!TValIsAllOnes && !FValIsAllZeros)
28519 return SDValue();
28521 SDValue Ret;
28522 if (TValIsAllOnes && FValIsAllZeros)
28523 Ret = Cond;
28524 else if (TValIsAllOnes)
28525 Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
28526 else if (FValIsAllZeros)
28527 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, DAG.getBitcast(CondVT, LHS));
28529 return DAG.getBitcast(VT, Ret);
28530 }
28532 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
28533 SDValue Cond = N->getOperand(0);
28534 SDValue LHS = N->getOperand(1);
28535 SDValue RHS = N->getOperand(2);
28537 SDLoc DL(N);
28538 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
28539 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
28540 if (!TrueC || !FalseC)
28541 return SDValue();
28543 // Don't do this for crazy integer types.
28544 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
28545 return SDValue();
28547 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
28548 // so that TrueC (the true value) is larger than FalseC.
28549 bool NeedsCondInvert = false;
28550 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
28551 // Efficiently invertible.
28552 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
28553 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
28554 isa<ConstantSDNode>(Cond.getOperand(1))))) {
28555 NeedsCondInvert = true;
28556 std::swap(TrueC, FalseC);
28557 }
28559 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
28560 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
28561 if (NeedsCondInvert) // Invert the condition if needed.
28562 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28563 DAG.getConstant(1, DL, Cond.getValueType()));
28565 // Zero extend the condition if needed.
28566 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
28568 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
28569 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
28570 DAG.getConstant(ShAmt, DL, MVT::i8));
28571 }
28573 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
28574 if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
28575 if (NeedsCondInvert) // Invert the condition if needed.
28576 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28577 DAG.getConstant(1, DL, Cond.getValueType()));
28579 // Zero extend the condition if needed.
28580 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
28581 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
28582 SDValue(FalseC, 0));
28583 }
28585 // Optimize cases that will turn into an LEA instruction. This requires
28586 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
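// For example (a sketch): select C, i32 13, i32 4 has Diff = 9, so it can be
// lowered as 4 + zext(C)*9, i.e. a single lea 4(cond, cond*8).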
28587 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
28588 uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
28589 if (N->getValueType(0) == MVT::i32)
28590 Diff = (unsigned)Diff;
28592 bool isFastMultiplier = false;
28593 if (Diff < 10) {
28594 switch ((unsigned char)Diff) {
28595 default:
28596 break; // 0x0
28597 case 1: // result = add base, cond
28598 case 2: // result = lea base( , cond*2)
28599 case 3: // result = lea base(cond, cond*2)
28600 case 4: // result = lea base( , cond*4)
28601 case 5: // result = lea base(cond, cond*4)
28602 case 8: // result = lea base( , cond*8)
28603 case 9: // result = lea base(cond, cond*8)
28604 isFastMultiplier = true;
28605 break;
28606 }
28607 }
28609 if (isFastMultiplier) {
28610 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
28611 if (NeedsCondInvert) // Invert the condition if needed.
28612 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28613 DAG.getConstant(1, DL, Cond.getValueType()));
28615 // Zero extend the condition if needed.
28616 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
28617 // Scale the condition by the difference.
28618 if (Diff != 1)
28619 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
28620 DAG.getConstant(Diff, DL, Cond.getValueType()));
28622 // Add the base if non-zero.
28623 if (FalseC->getAPIntValue() != 0)
28624 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
28625 SDValue(FalseC, 0));
28626 return Cond;
28627 }
28628 }
28631 return SDValue();
28632 }
28633 // If this is a bitcasted op that can be represented as another type, push
28634 // the bitcast to the inputs. This allows more opportunities for pattern
28635 // matching masked instructions. This is called when we know that the operation
28636 // is used as one of the inputs of a vselect.
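// For example (a sketch): bitcasting (v8i64 VALIGN X, Y, 1) to v16i32 can be
// rewritten as (v16i32 VALIGN (bitcast X), (bitcast Y), 2), because rotating
// by one i64 element equals rotating by two i32 elements.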
28637 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
28638 TargetLowering::DAGCombinerInfo &DCI) {
28639 // Make sure we have a bitcast.
28640 if (OrigOp.getOpcode() != ISD::BITCAST)
28641 return false;
28643 SDValue Op = OrigOp.getOperand(0);
28645 // If the operation is used by anything other than the bitcast, we shouldn't
28646 // do this combine as that would replicate the operation.
28647 if (!Op.hasOneUse())
28648 return false;
28650 MVT VT = OrigOp.getSimpleValueType();
28651 MVT EltVT = VT.getVectorElementType();
28652 SDLoc DL(Op.getNode());
28654 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
28655 SDValue Op2) {
28656 Op0 = DAG.getBitcast(VT, Op0);
28657 DCI.AddToWorklist(Op0.getNode());
28658 Op1 = DAG.getBitcast(VT, Op1);
28659 DCI.AddToWorklist(Op1.getNode());
28660 DCI.CombineTo(OrigOp.getNode(),
28661 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
28662 return true;
28663 };
28665 unsigned Opcode = Op.getOpcode();
28666 switch (Opcode) {
28667 case X86ISD::PALIGNR:
28668 // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
28669 if (!VT.is128BitVector())
28670 return false;
28671 Opcode = X86ISD::VALIGN;
28672 LLVM_FALLTHROUGH;
28673 case X86ISD::VALIGN: {
28674 if (EltVT != MVT::i32 && EltVT != MVT::i64)
28675 return false;
28676 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
28677 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
28678 unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
28679 unsigned EltSize = EltVT.getSizeInBits();
28680 // Make sure we can represent the same shift with the new VT.
28681 if ((ShiftAmt % EltSize) != 0)
28682 return false;
28683 Imm = ShiftAmt / EltSize;
28684 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
28685 DAG.getConstant(Imm, DL, MVT::i8));
28687 case X86ISD::SHUF128: {
28688 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
28689 return false;
28690 // Only change element size, not type.
28691 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
28692 return false;
28693 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
28694 Op.getOperand(2));
28695 }
28696 }
28698 return false;
28699 }
28701 /// Do target-specific dag combines on SELECT and VSELECT nodes.
28702 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
28703 TargetLowering::DAGCombinerInfo &DCI,
28704 const X86Subtarget &Subtarget) {
28705 SDLoc DL(N);
28706 SDValue Cond = N->getOperand(0);
28707 // Get the LHS/RHS of the select.
28708 SDValue LHS = N->getOperand(1);
28709 SDValue RHS = N->getOperand(2);
28710 EVT VT = LHS.getValueType();
28711 EVT CondVT = Cond.getValueType();
28712 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28714 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
28715 // instructions match the semantics of the common C idiom x<y?x:y but not
28716 // x<=y?x:y, because of how they handle negative zero (which can be
28717 // ignored in unsafe-math mode).
28718 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
28719 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
28720 VT != MVT::f80 && VT != MVT::f128 &&
28721 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
28722 (Subtarget.hasSSE2() ||
28723 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
28724 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
28726 unsigned Opcode = 0;
28727 // Check for x CC y ? x : y.
28728 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
28729 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
28730 switch (CC) {
28731 default: break;
28732 case ISD::SETULT:
28733 // Converting this to a min would handle NaNs incorrectly, and swapping
28734 // the operands would cause it to handle comparisons between positive
28735 // and negative zero incorrectly.
28736 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
28737 if (!DAG.getTarget().Options.UnsafeFPMath &&
28738 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
28739 break;
28740 std::swap(LHS, RHS);
28741 }
28742 Opcode = X86ISD::FMIN;
28743 break;
28744 case ISD::SETOLE:
28745 // Converting this to a min would handle comparisons between positive
28746 // and negative zero incorrectly.
28747 if (!DAG.getTarget().Options.UnsafeFPMath &&
28748 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
28749 break;
28750 Opcode = X86ISD::FMIN;
28751 break;
28752 case ISD::SETULE:
28753 // Converting this to a min would handle both negative zeros and NaNs
28754 // incorrectly, but we can swap the operands to fix both.
28755 std::swap(LHS, RHS);
28756 case ISD::SETOLT:
28757 case ISD::SETLT:
28758 case ISD::SETLE:
28759 Opcode = X86ISD::FMIN;
28760 break;
28762 case ISD::SETOGE:
28763 // Converting this to a max would handle comparisons between positive
28764 // and negative zero incorrectly.
28765 if (!DAG.getTarget().Options.UnsafeFPMath &&
28766 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
28767 break;
28768 Opcode = X86ISD::FMAX;
28769 break;
28770 case ISD::SETUGT:
28771 // Converting this to a max would handle NaNs incorrectly, and swapping
28772 // the operands would cause it to handle comparisons between positive
28773 // and negative zero incorrectly.
28774 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
28775 if (!DAG.getTarget().Options.UnsafeFPMath &&
28776 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
28777 break;
28778 std::swap(LHS, RHS);
28779 }
28780 Opcode = X86ISD::FMAX;
28781 break;
28782 case ISD::SETUGE:
28783 // Converting this to a max would handle both negative zeros and NaNs
28784 // incorrectly, but we can swap the operands to fix both.
28785 std::swap(LHS, RHS);
28786 case ISD::SETOGT:
28787 case ISD::SETGT:
28788 case ISD::SETGE:
28789 Opcode = X86ISD::FMAX;
28790 break;
28791 }
28792 // Check for x CC y ? y : x -- a min/max with reversed arms.
28793 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
28794 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
28795 switch (CC) {
28796 default: break;
28797 case ISD::SETOGE:
28798 // Converting this to a min would handle comparisons between positive
28799 // and negative zero incorrectly, and swapping the operands would
28800 // cause it to handle NaNs incorrectly.
28801 if (!DAG.getTarget().Options.UnsafeFPMath &&
28802 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
28803 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
28804 break;
28805 std::swap(LHS, RHS);
28806 }
28807 Opcode = X86ISD::FMIN;
28808 break;
28809 case ISD::SETUGT:
28810 // Converting this to a min would handle NaNs incorrectly.
28811 if (!DAG.getTarget().Options.UnsafeFPMath &&
28812 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
28813 break;
28814 Opcode = X86ISD::FMIN;
28815 break;
28816 case ISD::SETUGE:
28817 // Converting this to a min would handle both negative zeros and NaNs
28818 // incorrectly, but we can swap the operands to fix both.
28819 std::swap(LHS, RHS);
28820 case ISD::SETOLT:
28821 case ISD::SETLT:
28822 case ISD::SETLE:
28823 Opcode = X86ISD::FMIN;
28824 break;
28826 case ISD::SETULT:
28827 // Converting this to a max would handle NaNs incorrectly.
28828 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
28829 break;
28830 Opcode = X86ISD::FMAX;
28831 break;
28832 case ISD::SETOLE:
28833 // Converting this to a max would handle comparisons between positive
28834 // and negative zero incorrectly, and swapping the operands would
28835 // cause it to handle NaNs incorrectly.
28836 if (!DAG.getTarget().Options.UnsafeFPMath &&
28837 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
28838 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
28839 break;
28840 std::swap(LHS, RHS);
28841 }
28842 Opcode = X86ISD::FMAX;
28843 break;
28844 case ISD::SETULE:
28845 // Converting this to a max would handle both negative zeros and NaNs
28846 // incorrectly, but we can swap the operands to fix both.
28847 std::swap(LHS, RHS);
28848 case ISD::SETOLT:
28849 case ISD::SETLT:
28850 case ISD::SETLE:
28851 Opcode = X86ISD::FMAX;
28852 break;
28853 }
28854 }
28856 if (Opcode)
28857 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
28858 }
28860 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
28861 // lowering on KNL. In this case we convert it to
28862 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
28863 // The same situation holds for all 128- and 256-bit vectors of i8 and i16.
28864 // Since SKX these selects have a proper lowering.
28865 if (Subtarget.hasAVX512() && CondVT.isVector() &&
28866 CondVT.getVectorElementType() == MVT::i1 &&
28867 (VT.is128BitVector() || VT.is256BitVector()) &&
28868 (VT.getVectorElementType() == MVT::i8 ||
28869 VT.getVectorElementType() == MVT::i16) &&
28870 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
28871 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
28872 DCI.AddToWorklist(Cond.getNode());
28873 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
28874 }
28876 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
28877 return V;
28879 // Canonicalize max and min:
28880 // (x > y) ? x : y -> (x >= y) ? x : y
28881 // (x < y) ? x : y -> (x <= y) ? x : y
28882 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
28883 // the need for an extra compare
28884 // against zero. e.g.
28885 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
28886 //
28887 // testl %edi, %edi
28888 // movl $0, %eax
28889 // cmovgl %edi, %eax
28890 // =>
28891 // xorl %eax, %eax
28892 // testl %edi, %edi
28893 // cmovsl %eax, %edi
28894 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
28895 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
28896 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
28897 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
28898 switch (CC) {
28899 default: break;
28900 case ISD::SETLT:
28901 case ISD::SETGT: {
28902 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
28903 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
28904 Cond.getOperand(0), Cond.getOperand(1), NewCC);
28905 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
28906 }
28907 }
28908 }
28910 // Early exit check
28911 if (!TLI.isTypeLegal(VT))
28912 return SDValue();
28914 // Match VSELECTs into subs with unsigned saturation.
28915 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
28916 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
28917 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
28918 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
28919 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
28921 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
28922 // left side invert the predicate to simplify logic below.
28923 SDValue Other;
28924 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
28925 Other = RHS;
28926 CC = ISD::getSetCCInverse(CC, true);
28927 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
28928 Other = LHS;
28929 }
28931 if (Other.getNode() && Other->getNumOperands() == 2 &&
28932 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
28933 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
28934 SDValue CondRHS = Cond->getOperand(1);
28936 // Look for a general sub with unsigned saturation first.
28937 // x >= y ? x-y : 0 --> subus x, y
28938 // x > y ? x-y : 0 --> subus x, y
28939 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
28940 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
28941 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
28943 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
28944 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
28945 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
28946 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
28947 // If the RHS is a constant we have to reverse the const
28948 // canonicalization.
28949 // x > C-1 ? x+-C : 0 --> subus x, C
28950 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
28951 CondRHSConst->getAPIntValue() ==
28952 (-OpRHSConst->getAPIntValue() - 1))
28953 return DAG.getNode(
28954 X86ISD::SUBUS, DL, VT, OpLHS,
28955 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
28957 // Another special case: If C was a sign bit, the sub has been
28958 // canonicalized into a xor.
28959 // FIXME: Would it be better to use computeKnownBits to determine
28960 // whether it's safe to decanonicalize the xor?
28961 // x s< 0 ? x^C : 0 --> subus x, C
28962 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
28963 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
28964 OpRHSConst->getAPIntValue().isSignBit())
28965 // Note that we have to rebuild the RHS constant here to ensure we
28966 // don't rely on particular values of undef lanes.
28967 return DAG.getNode(
28968 X86ISD::SUBUS, DL, VT, OpLHS,
28969 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
28970 }
28971 }
28972 }
28974 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, Subtarget))
28975 return V;
28977 // If this is a *dynamic* select (non-constant condition) and we can match
28978 // this node with one of the variable blend instructions, restructure the
28979 // condition so that the blends can use the high bit of each element and use
28980 // SimplifyDemandedBits to simplify the condition operand.
28981 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
28982 !DCI.isBeforeLegalize() &&
28983 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
28984 unsigned BitWidth = Cond.getScalarValueSizeInBits();
28986 // Don't optimize vector selects that map to mask-registers.
28987 if (BitWidth == 1)
28988 return SDValue();
28990 // We can only handle the cases where VSELECT is directly legal on the
28991 // subtarget. We custom lower VSELECT nodes with constant conditions and
28992 // this makes it hard to see whether a dynamic VSELECT will correctly
28993 // lower, so we both check the operation's status and explicitly handle the
28994 // cases where a *dynamic* blend will fail even though a constant-condition
28995 // blend could be custom lowered.
28996 // FIXME: We should find a better way to handle this class of problems.
28997 // Potentially, we should combine constant-condition vselect nodes
28998 // pre-legalization into shuffles and not mark as many types as custom
28999 // lowered.
29000 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
29001 return SDValue();
29002 // FIXME: We don't support i16-element blends currently. We could and
29003 // should support them by making *all* the bits in the condition be set
29004 // rather than just the high bit and using an i8-element blend.
29005 if (VT.getVectorElementType() == MVT::i16)
29006 return SDValue();
29007 // Dynamic blending was only available from SSE4.1 onward.
29008 if (VT.is128BitVector() && !Subtarget.hasSSE41())
29009 return SDValue();
29010 // Byte blends are only available in AVX2.
29011 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
29012 return SDValue();
29014 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
29015 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
29017 APInt KnownZero, KnownOne;
29018 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
29019 DCI.isBeforeLegalizeOps());
29020 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
29021 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
29022 TLO)) {
29023 // If we changed the computation somewhere in the DAG, this change
29024 // will affect all users of Cond.
29025 // Make sure it is fine and update all the nodes so that we do not
29026 // use the generic VSELECT anymore. Otherwise, we may perform
29027 // wrong optimizations as we messed up with the actual expectation
29028 // for the vector boolean values.
29029 if (Cond != TLO.Old) {
29030 // Check all uses of that condition operand to check whether it will be
29031 // consumed by non-BLEND instructions, which may depend on all bits are
29032 // set properly.
29033 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
29034 I != E; ++I)
29035 if (I->getOpcode() != ISD::VSELECT)
29036 // TODO: Add other opcodes eventually lowered into BLEND.
29037 return SDValue();
29039 // Update all the users of the condition, before committing the change,
29040 // so that the VSELECT optimizations that expect the correct vector
29041 // boolean value will not be triggered.
29042 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
29043 I != E; ++I)
29044 DAG.ReplaceAllUsesOfValueWith(
29045 SDValue(*I, 0),
29046 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
29047 Cond, I->getOperand(1), I->getOperand(2)));
29048 DCI.CommitTargetLoweringOpt(TLO);
29049 return SDValue();
29050 }
29051 // At this point, only Cond is changed. Change the condition
29052 // just for N to keep the opportunity to optimize all other
29053 // users their own way.
29054 DAG.ReplaceAllUsesOfValueWith(
29055 SDValue(N, 0),
29056 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
29057 TLO.New, N->getOperand(1), N->getOperand(2)));
29058 return SDValue();
29059 }
29060 }
29062 // Look for vselects with LHS/RHS being bitcasted from an operation that
29063 // can be executed on another type. Push the bitcast to the inputs of
29064 // the operation. This exposes opportunities for using masking instructions.
29065 if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
29066 CondVT.getVectorElementType() == MVT::i1) {
29067 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
29068 return SDValue(N, 0);
29069 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
29070 return SDValue(N, 0);
29071 }
29073 return SDValue();
29074 }
29076 /// Combine:
29077 ///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
29078 /// to:
29079 ///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
29080 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
29081 /// Note that this is only legal for some op/cc combinations.
29082 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
29083 SelectionDAG &DAG) {
29084 // This combine only operates on CMP-like nodes.
29085 if (!(Cmp.getOpcode() == X86ISD::CMP ||
29086 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
29089 // This only applies to variations of the common case:
29090 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
29091 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
29092 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
29093 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
29094 // Using the proper condcodes (see below), overflow is checked for.
29096 // FIXME: We can generalize both constraints:
29097 // - XOR/OR/AND (if they were made to survive AtomicExpand)
29099 // if the result is compared.
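  //
  // For example, with Addend == 1 the rewrite turns
  //   (setcc slt (atomic_load_add x, 1), 0) into (setcc sle (LADD x, 1), 0):
  // mathematically x < 0 iff x + 1 <= 0, and using COND_LE (ZF | (SF ^ OF))
  // also keeps the overflowing case x == INT_MAX correct, since INC sets
  // SF and OF together there and LE then evaluates to false.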

  SDValue CmpLHS = Cmp.getOperand(0);
  SDValue CmpRHS = Cmp.getOperand(1);

  if (!CmpLHS.hasOneUse())
    return SDValue();

  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
  if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
    return SDValue();

  const unsigned Opc = CmpLHS.getOpcode();

  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
    return SDValue();

  SDValue OpRHS = CmpLHS.getOperand(2);
  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
  if (!OpRHSC)
    return SDValue();

  APInt Addend = OpRHSC->getAPIntValue();
  if (Opc == ISD::ATOMIC_LOAD_SUB)
    Addend = -Addend;

  if (CC == X86::COND_S && Addend == 1)
    CC = X86::COND_LE;
  else if (CC == X86::COND_NS && Addend == 1)
    CC = X86::COND_G;
  else if (CC == X86::COND_G && Addend == -1)
    CC = X86::COND_GE;
  else if (CC == X86::COND_LE && Addend == -1)
    CC = X86::COND_L;
  else
    return SDValue();

  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
                                DAG.getUNDEF(CmpLHS.getValueType()));
  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
  return LockOp;
}

// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
// condition code.
//
// Simplify the following patterns:
// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
//
// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
//
// where Op could be BRCOND or CMOV.
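//
// For example, (brcond (cmp (setcc COND_B, EFLAGS), 1), COND_E) branches
// exactly when the carry flag is set, so it simplifies to
// (brcond EFLAGS, COND_B); comparing the same setcc against 0 instead would
// flip it to the opposite condition, COND_AE.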
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
  // This combine only operates on CMP-like nodes.
  if (!(Cmp.getOpcode() == X86ISD::CMP ||
        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
    return SDValue();

  // Quit if not used as a boolean value.
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  // Check CMP operands. One of them should be 0 or 1 and the other should be
  // a SetCC or extended from it.
  SDValue Op1 = Cmp.getOperand(0);
  SDValue Op2 = Cmp.getOperand(1);

  SDValue SetCC;
  const ConstantSDNode* C = nullptr;
  bool needOppositeCond = (CC == X86::COND_E);
  bool checkAgainstTrue = false; // Is it a comparison against 1?

  if ((C = dyn_cast<ConstantSDNode>(Op1)))
    SetCC = Op2;
  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
    SetCC = Op1;
  else // Quit if neither operand is a constant.
    return SDValue();

  if (C->getZExtValue() == 1) {
    needOppositeCond = !needOppositeCond;
    checkAgainstTrue = true;
  } else if (C->getZExtValue() != 0)
    // Quit if the constant is neither 0 nor 1.
    return SDValue();

  bool truncatedToBoolWithAnd = false;
  // Skip (zext $x), (trunc $x), or (and $x, 1) nodes.
  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
         SetCC.getOpcode() == ISD::TRUNCATE ||
         SetCC.getOpcode() == ISD::AND) {
    if (SetCC.getOpcode() == ISD::AND) {
      int OpIdx = -1;
      if (isOneConstant(SetCC.getOperand(0)))
        OpIdx = 1;
      if (isOneConstant(SetCC.getOperand(1)))
        OpIdx = 0;
      if (OpIdx < 0)
        break;
      SetCC = SetCC.getOperand(OpIdx);
      truncatedToBoolWithAnd = true;
    } else
      SetCC = SetCC.getOperand(0);
  }

  switch (SetCC.getOpcode()) {
  case X86ISD::SETCC_CARRY:
    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
    // truncated to i1 using 'and'.
    if (checkAgainstTrue && !truncatedToBoolWithAnd)
      break;
    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
           "Invalid use of SETCC_CARRY!");
    LLVM_FALLTHROUGH;
  case X86ISD::SETCC:
    // Set the condition code or the opposite one if necessary.
    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(1);
  case X86ISD::CMOV: {
    // Check whether the false/true values are canonical, i.e. 0 or 1.
    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
    // Quit if the true value is not a constant.
    if (!TVal)
      return SDValue();
    // Quit if the false value is not a constant.
    if (!FVal) {
      SDValue Op = SetCC.getOperand(0);
      // Skip 'zext' or 'trunc' node.
      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
          Op.getOpcode() == ISD::TRUNCATE)
        Op = Op.getOperand(0);
      // A special case for rdrand/rdseed, where 0 is set if the false cond is
      // found.
      if ((Op.getOpcode() != X86ISD::RDRAND &&
           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
        return SDValue();
    }
    // Quit if the false value is not the constant 0 or 1.
    bool FValIsFalse = true;
    if (FVal && FVal->getZExtValue() != 0) {
      if (FVal->getZExtValue() != 1)
        return SDValue();
      // If FVal is 1, the opposite cond is needed.
      needOppositeCond = !needOppositeCond;
      FValIsFalse = false;
    }
    // Quit if TVal is not the constant opposite of FVal.
    if (FValIsFalse && TVal->getZExtValue() != 1)
      return SDValue();
    if (!FValIsFalse && TVal->getZExtValue() != 0)
      return SDValue();
    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(3);
  }
  }

  return SDValue();
}

/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
///   (X86or (X86setcc) (X86setcc))
///   (X86cmp (and (X86setcc) (X86setcc)), 0)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
                                           X86::CondCode &CC1, SDValue &Flags,
                                           bool &isAnd) {
  if (Cond->getOpcode() == X86ISD::CMP) {
    if (!isNullConstant(Cond->getOperand(1)))
      return false;

    Cond = Cond->getOperand(0);
  }

  isAnd = false;

  SDValue SetCC0, SetCC1;
  switch (Cond->getOpcode()) {
  default: return false;
  case ISD::AND:
  case X86ISD::AND:
    isAnd = true;
    LLVM_FALLTHROUGH;
  case ISD::OR:
  case X86ISD::OR:
    SetCC0 = Cond->getOperand(0);
    SetCC1 = Cond->getOperand(1);
    break;
  }

  // Make sure we have SETCC nodes, using the same flags value.
  if (SetCC0.getOpcode() != X86ISD::SETCC ||
      SetCC1.getOpcode() != X86ISD::SETCC ||
      SetCC0->getOperand(1) != SetCC1->getOperand(1))
    return false;

  CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
  CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
  Flags = SetCC0->getOperand(1);
  return true;
}

/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
                                  SelectionDAG &DAG) {
  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
    return R;
  return combineSetCCAtomicArith(EFLAGS, CC, DAG);
}

/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDLoc DL(N);

  // If the flag operand isn't dead, don't touch this CMOV.
  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
    return SDValue();

  SDValue FalseOp = N->getOperand(0);
  SDValue TrueOp = N->getOperand(1);
  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
  SDValue Cond = N->getOperand(3);

  if (CC == X86::COND_E || CC == X86::COND_NE) {
    switch (Cond.getOpcode()) {
    default: break;
    case X86ISD::BSR:
    case X86ISD::BSF:
      // If the operand of BSR / BSF is proven never zero, then ZF cannot be
      // set.
      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
        return (CC == X86::COND_E) ? FalseOp : TrueOp;
    }
  }

  // Try to simplify the EFLAGS and condition code operands.
  // We can't always do this as FCMOV only supports a subset of X86 cond.
  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
    if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
                       Flags};
      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
    }
  }

  // If this is a select between two integer constants, try to do some
  // optimizations. Note that the operands are ordered the opposite of SELECT
  // operands.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
      // larger than FalseC (the false value).
      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueC, FalseC);
        std::swap(TrueOp, FalseOp);
      }

      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
      // This is efficient for any integer data type (including i8/i16) and
      // shift amount.
      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
        Cond = getSETCC(CC, Cond, DL, DAG);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(ShAmt, DL, MVT::i8));
        if (N->getNumValues() == 2) // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
      // for any integer data type, including i8/i16.
      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
        Cond = getSETCC(CC, Cond, DL, DAG);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                           FalseC->getValueType(0), Cond);
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));

        if (N->getNumValues() == 2) // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize cases that will turn into an LEA instruction. This requires
      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

        bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
          case 1:  // result = add base, cond
          case 2:  // result = lea base(    , cond*2)
          case 3:  // result = lea base(cond, cond*2)
          case 4:  // result = lea base(    , cond*4)
          case 5:  // result = lea base(cond, cond*4)
          case 8:  // result = lea base(    , cond*8)
          case 9:  // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }

        if (isFastMultiplier) {
          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
          Cond = getSETCC(CC, Cond, DL, DAG);
          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
          // Scale the condition by the difference.
          if (Diff != 1)
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(Diff, DL, Cond.getValueType()));

          // Add the base if non-zero.
          if (FalseC->getAPIntValue() != 0)
            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                               SDValue(FalseC, 0));
          if (N->getNumValues() == 2) // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }

  // Handle these cases:
  //   (select (x != c), e, c) -> select (x != c), e, x),
  //   (select (x == c), c, e) -> select (x == c), x, e)
  // where c is an integer constant, and the "select" is the combination
  // of CMOV and CMP.
  //
  // The rationale for this change is that the conditional-move from a constant
  // needs two instructions, however, conditional-move from a register needs
  // only one instruction.
  //
  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
  // some instruction-combining opportunities. This opt needs to be
  // postponed as late as possible.
  //
  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
    // The DCI.xxxx conditions are provided to postpone the optimization as
    // late as possible.

    ConstantSDNode *CmpAgainst = nullptr;
    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
        !isa<ConstantSDNode>(Cond.getOperand(0))) {

      if (CC == X86::COND_NE &&
          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueOp, FalseOp);
      }

      if (CC == X86::COND_E &&
          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
                          DAG.getConstant(CC, DL, MVT::i8), Cond };
        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
      }
    }
  }

  // Fold and/or of setcc's to double CMOV:
  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
  //
  // This combine lets us generate:
  //   cmovcc1 (jcc1 if we don't have CMOV)
  //   cmovcc2 (same)
  // instead of:
  //   setcc1
  //   setcc2
  //   and/or
  //   cmovne (jne if we don't have CMOV)
  // When we can't use the CMOV instruction, it might increase branch
  // mispredicts.
  // When we can use CMOV, or when there is no mispredict, this improves
  // throughput and reduces register pressure.
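  //
  // For example, (CMOV F, T, ((cc1 | cc2) != 0)) evaluates after the rewrite
  // as cc2 ? T : (cc1 ? T : F), which selects T exactly when either condition
  // holds, matching the original OR of the two setccs.
  //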
  if (CC == X86::COND_NE) {
    SDValue Flags;
    X86::CondCode CC0, CC1;
    bool isAndSetCC;
    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
      if (isAndSetCC) {
        std::swap(FalseOp, TrueOp);
        CC0 = X86::GetOppositeBranchCondition(CC0);
        CC1 = X86::GetOppositeBranchCondition(CC1);
      }

      SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
                        Flags};
      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
      SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
      return CMOV;
    }
  }

  return SDValue();
}

/// Different mul shrinking modes.
enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };

static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
  EVT VT = N->getOperand(0).getValueType();
  if (VT.getScalarSizeInBits() != 32)
    return false;

  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
  unsigned SignBits[2] = {1, 1};
  bool IsPositive[2] = {false, false};
  for (unsigned i = 0; i < 2; i++) {
    SDValue Opd = N->getOperand(i);

    // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
    // compute signbits for it separately.
    if (Opd.getOpcode() == ISD::ANY_EXTEND) {
      // For anyextend, it is safe to assume an appropriate number of leading
      // sign/zero bits.
      if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
        SignBits[i] = 25;
      else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
               MVT::i16)
        SignBits[i] = 17;
      else
        return false;
      IsPositive[i] = true;
    } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
      // All the operands of BUILD_VECTOR need to be integer constants.
      // Find the smallest value range which all the operands belong to.
      SignBits[i] = 32;
      IsPositive[i] = true;
      for (const SDValue &SubOp : Opd.getNode()->op_values()) {
        if (SubOp.isUndef())
          continue;
        auto *CN = dyn_cast<ConstantSDNode>(SubOp);
        if (!CN)
          return false;
        APInt IntVal = CN->getAPIntValue();
        if (IntVal.isNegative())
          IsPositive[i] = false;
        SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
      }
    } else {
      SignBits[i] = DAG.ComputeNumSignBits(Opd);
      if (Opd.getOpcode() == ISD::ZERO_EXTEND)
        IsPositive[i] = true;
    }
  }

  bool AllPositive = IsPositive[0] && IsPositive[1];
  unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
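  // For example, in an i32 lane the constant 100 (0x00000064) has 25 sign
  // bits while 200 (0x000000C8) has only 24, so a [-128, 127] range yields
  // MinSignBits >= 25 and a [0, 255] range yields >= 24 with AllPositive set.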
  // When ranges are from -128 ~ 127, use MULS8 mode.
  if (MinSignBits >= 25)
    Mode = MULS8;
  // When ranges are from 0 ~ 255, use MULU8 mode.
  else if (AllPositive && MinSignBits >= 24)
    Mode = MULU8;
  // When ranges are from -32768 ~ 32767, use MULS16 mode.
  else if (MinSignBits >= 17)
    Mode = MULS16;
  // When ranges are from 0 ~ 65535, use MULU16 mode.
  else if (AllPositive && MinSignBits >= 16)
    Mode = MULU16;
  else
    return false;
  return true;
}

/// When the operands of a vector mul are extended from smaller size values,
/// like i8 and i16, the type of the mul may be shrunk to generate more
/// efficient code. Two typical patterns are handled:
/// Pattern1:
///     %2 = sext/zext <N x i8> %1 to <N x i32>
///     %4 = sext/zext <N x i8> %3 to <N x i32>
///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
///     %5 = mul <N x i32> %2, %4
///
/// Pattern2:
///     %2 = zext/sext <N x i16> %1 to <N x i32>
///     %4 = zext/sext <N x i16> %3 to <N x i32>
///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
///     %5 = mul <N x i32> %2, %4
///
/// There are four mul shrinking modes:
/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
/// generate pmullw+sext32 for it (MULS8 mode).
/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
/// generate pmullw+zext32 for it (MULU8 mode).
/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
/// generate pmullw+pmulhw for it (MULS16 mode).
/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
/// generate pmullw+pmulhuw for it (MULU16 mode).
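///
/// For example, in MULU16 mode with %2 and %4 in [0, 65535] the full i32
/// product is at most 65535 * 65535 == 0xFFFE0001; pmullw produces the low 16
/// bits of each lane and pmulhuw the high 16 bits, and the
/// punpcklwd/punpckhwd shuffles below interleave them back into i32 lanes.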
static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // Check for legality:
  // pmullw/pmulhw are not supported by SSE.
  if (!Subtarget.hasSSE2())
    return SDValue();

  // Check for profitability:
  // pmulld is supported since SSE41. It is better to use pmulld
  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
  // the expanded alternative.
  bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
  if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
    return SDValue();

  ShrinkMode Mode;
  if (!canReduceVMulWidth(N, DAG, Mode))
    return SDValue();

  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getOperand(0).getValueType();
  unsigned RegSize = 128;
  MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
  EVT ReducedVT =
      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
  // Shrink the operands of the mul.
  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);

  if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
    // Generate the lower part of the mul: pmullw. For MULU8/MULS8, only the
    // lower part is needed.
    SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
    if (Mode == MULU8 || Mode == MULS8) {
      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
                         DL, VT, MulLo);
    } else {
      MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
      // Generate the higher part of the mul: pmulhw/pmulhuw. For
      // MULU16/MULS16, the higher part is also needed.
      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
                                  ReducedVT, NewN0, NewN1);

      // Repack the lower part and higher part result of the mul into a wider
      // result.
      // Generate shuffle functioning as punpcklwd.
      SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
        ShuffleMask[2 * i] = i;
        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
      }
      SDValue ResLo =
          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
      ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
      // Generate shuffle functioning as punpckhwd.
      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
        ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
      }
      SDValue ResHi =
          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
      ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
    }
  } else {
    // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
    // to legalize the mul explicitly because implicit legalization for type
    // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
    // instructions which will not exist when we explicitly legalize it by
    // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
    // <4 x i16> undef).
    //
    // Legalize the operands of the mul.
    // FIXME: We may be able to handle non-concatenated vectors by insertion.
    unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
    if ((RegSize % ReducedSizeInBits) != 0)
      return SDValue();

    SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
                                 DAG.getUNDEF(ReducedVT));
    Ops[0] = NewN0;
    NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
    Ops[0] = NewN1;
    NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);

    if (Mode == MULU8 || Mode == MULS8) {
      // Generate the lower part of the mul: pmullw. For MULU8/MULS8, only the
      // lower part is needed.
      SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);

      // Convert the type of the mul result to VT.
      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
      SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
                                              : ISD::SIGN_EXTEND_VECTOR_INREG,
                                DL, ResVT, Mul);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                         DAG.getIntPtrConstant(0, DL));
    } else {
      // Generate the lower and higher parts of the mul: pmullw and
      // pmulhw/pmulhuw. For MULU16/MULS16, both parts are needed.
      SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
                                  OpsVT, NewN0, NewN1);

      // Repack the lower part and higher part result of the mul into a wider
      // result. Make sure the type of the mul result is VT.
      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
      SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
      Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                         DAG.getIntPtrConstant(0, DL));
    }
  }
}

/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
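///
/// For example, x * 45 decomposes as (x * 9) * 5 and lowers to two LEAs,
/// while x * 20 splits into a multiply by 5 (an LEA) and a shift by 2;
/// x * 17 is handled below as (x << 4) + x.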
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (DCI.isBeforeLegalize() && VT.isVector())
    return reduceVMULWidth(N, DAG, Subtarget);

  // An imul is usually smaller than the alternative sequence.
  if (DAG.getMachineFunction().getFunction()->optForMinSize())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (VT != MVT::i64 && VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();

  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }

  SDLoc DL(N);
  SDValue NewMul;
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){

    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is a power of 2, issue it first. We want the
      // multiply by 3, 5, or 9 to be folded into the addressing mode unless
      // the lone use is an add.
      std::swap(MulAmt1, MulAmt2);

    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, DL, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, DL, VT));
  }

  if (!NewMul) {
    assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
           && "Both cases that could cause potential overflows should have "
              "already been handled.");
    if (isPowerOf2_64(MulAmt - 1))
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                                       DAG.getConstant(Log2_64(MulAmt - 1), DL,
                                                       MVT::i8)));
    else if (isPowerOf2_64(MulAmt + 1))
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      NewMul = DAG.getNode(ISD::SUB, DL, VT,
                           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                                       DAG.getConstant(Log2_64(MulAmt + 1),
                                                       DL, MVT::i8)),
                           N->getOperand(0));
  }

  if (NewMul)
    // Do not add new nodes to the DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);

  return SDValue();
}

static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zeros or all ones.
  if (VT.isInteger() && !VT.isVector() &&
      N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
    const APInt &ShAmt = N1C->getAPIntValue();
    Mask = Mask.shl(ShAmt);
    bool MaskOK = false;
    // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure we are semantics
    // preserving.
    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
    // of the underlying setcc_c operation if the setcc_c was zero extended.
    // Consider the following example:
    //   zext(setcc_c)                 -> i32 0x0000FFFF
    //   c1                            -> i32 0x0000FFFF
    //   c2                            -> i32 0x00000001
    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
                N00.getOpcode() == ISD::ANY_EXTEND) &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
    }
    if (MaskOK && Mask != 0) {
      SDLoc DL(N);
      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
    }
  }

  // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
  // shl.
  // (shl V, 1) -> add V,V
  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
      assert(N0.getValueType().isVector() && "Invalid vector shift type");
      // We shift all of the values by one. In many cases we do not have
      // hardware support for this operation. This is better expressed as an
      // ADD of two values.
      if (N1SplatC->getAPIntValue() == 1)
        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
    }

  return SDValue();
}

static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned Size = VT.getSizeInBits();

  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
  // depending on sign of (SarConst - [56,48,32,24,16])

  // sexts in X86 are MOVs. The MOVs have the same code size
  // as above SHIFTs (only SHIFT on 1 has lower code size).
  // However the MOVs have 2 advantages to a SHIFT:
  // 1. MOVs can write to a register that differs from source
  // 2. MOVs accept memory operands
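  //
  // For example, with Size == 32 and SVT == i8 (so ShlConst == 24):
  //   (sra (shl x, 24), 25) -> (sra (sext_inreg x, i8), 1)
  //   (sra (shl x, 24), 23) -> (shl (sext_inreg x, i8), 1)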

  if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
      N0.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
  EVT CVT = N1.getValueType();

  if (SarConst.isNegative())
    return SDValue();

  for (MVT SVT : MVT::integer_valuetypes()) {
    unsigned ShiftSize = SVT.getSizeInBits();
    // Skip types without a corresponding sext/zext and ShlConst values that
    // are not one of [56,48,32,24,16].
    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
      continue;
    SDLoc DL(N);
    SDValue NN =
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
    SarConst = SarConst - (Size - ShiftSize);
    if (SarConst == 0)
      return NN;
    else if (SarConst.isNegative())
      return DAG.getNode(ISD::SHL, DL, VT, NN,
                         DAG.getConstant(-SarConst, DL, CVT));
    else
      return DAG.getNode(ISD::SRA, DL, VT, NN,
                         DAG.getConstant(SarConst, DL, CVT));
  }
  return SDValue();
}

/// \brief Returns a vector of 0s if the node in input is a vector logical
/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
      (!Subtarget.hasInt256() ||
       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
    return SDValue();

  SDValue Amt = N->getOperand(1);
  SDLoc DL(N);
  if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
    if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
      const APInt &ShiftAmt = AmtSplat->getAPIntValue();
      unsigned MaxAmount = VT.getSimpleVT().getScalarSizeInBits();

      // SSE2/AVX2 logical shifts always return a vector of 0s
      // if the shift amount is bigger than or equal to
      // the element size. The constant shift amount will be
      // encoded as an 8-bit immediate.
      if (ShiftAmt.trunc(8).uge(MaxAmount))
        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
    }

  return SDValue();
}

static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  if (N->getOpcode() == ISD::SHL)
    if (SDValue V = combineShiftLeft(N, DAG))
      return V;

  if (N->getOpcode() == ISD::SRA)
    if (SDValue V = combineShiftRightAlgebraic(N, DAG))
      return V;

  // Try to fold this logical shift into a zero vector.
  if (N->getOpcode() != ISD::SRA)
    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
      return V;

  return SDValue();
}

static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget &Subtarget) {
  assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) &&
         "Unexpected opcode");
  EVT VT = N->getValueType(0);
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();

  // This fails for mask register (vXi1) shifts.
  if ((NumBitsPerElt % 8) != 0)
    return SDValue();

  // Out of range logical bit shifts are guaranteed to be zero.
  APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
  if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt))
    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));

  // Shift N0 by zero -> N0.
  if (!ShiftVal)
    return N->getOperand(0);

  // Shift zero -> zero.
  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));

  // We can decode 'whole byte' logical bit shifts as shuffles.
  if ((ShiftVal.getZExtValue() % 8) == 0) {
    SDValue Op(N, 0);
    SmallVector<int, 1> NonceMask; // Just a placeholder.
    NonceMask.push_back(0);
    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }

  return SDValue();
}

/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
/// OR -> CMPNEQSS.
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
  // we're requiring SSE2 for both.
  if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);
    SDLoc DL(N);

    // The SETCCs should both refer to the same CMP.
    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
      return SDValue();

    SDValue CMP00 = CMP0->getOperand(0);
    SDValue CMP01 = CMP0->getOperand(1);
    EVT VT = CMP00.getValueType();

    if (VT == MVT::f32 || VT == MVT::f64) {
      bool ExpectingFlags = false;
      // Check for any users that want flags:
      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
           !ExpectingFlags && UI != UE; ++UI)
        switch (UI->getOpcode()) {
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
        case ISD::CopyToReg:
        case ISD::SIGN_EXTEND:
        case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }

      if (!ExpectingFlags) {
        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }

        if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
          // FIXME: need symbolic constants for these magic numbers.
          // See X86ATTInstPrinter.cpp:printSSECC().
          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
          if (Subtarget.hasAVX512()) {
            SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
                                         CMP01,
                                         DAG.getConstant(x86cc, DL, MVT::i8));
            if (N->getValueType(0) != MVT::i1)
              return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
                                 FSetCC);
            return FSetCC;
          }
          SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
                                              CMP00.getValueType(), CMP00, CMP01,
                                              DAG.getConstant(x86cc, DL,
                                                              MVT::i8));

          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
          MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

          if (is64BitFP && !Subtarget.is64Bit()) {
            // On a 32-bit target, we cannot bitcast the 64-bit float to a
            // 64-bit integer, since that's not a legal type. Since
            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
            // bits, but can do this little dance to extract the lowest 32 bits
            // and work with those going forward.
            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                                           OnesOrZeroesF);
            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
            OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
                                        Vector32, DAG.getIntPtrConstant(0, DL));
            IntVT = MVT::i32;
          }

          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
                                      DAG.getConstant(1, DL, IntVT));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                              ANDed);
          return OneBitOfTruth;
        }
      }
    }
  }
  return SDValue();
}

/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::AND);

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
    return SDValue();

  // Canonicalize XOR to the left.
  if (N1.getOpcode() == ISD::XOR)
    std::swap(N0, N1);

  if (N0.getOpcode() != ISD::XOR)
    return SDValue();

  SDValue N00 = N0->getOperand(0);
  SDValue N01 = N0->getOperand(1);

  N01 = peekThroughBitcasts(N01);

  // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
  // insert_subvector building a 256-bit AllOnes vector.
  if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
    if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
      return SDValue();

    SDValue V1 = N01->getOperand(0);
    SDValue V2 = N01->getOperand(1);
    if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
        !V1.getOperand(0).isUndef() ||
        !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
        !ISD::isBuildVectorAllOnes(V2.getNode()))
      return SDValue();
  }
  return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
}

// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
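//
// For example, (zext (and (trunc A), (trunc B))) with 256-bit A and B is
// rewritten here as a wide AND of A and B followed by a mask to the narrow
// element bits, instead of performing the AND at the narrow 128-bit type.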
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.is256BitVector())
    return SDValue();

  assert((N->getOpcode() == ISD::ANY_EXTEND ||
          N->getOpcode() == ISD::ZERO_EXTEND ||
          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

  SDValue Narrow = N->getOperand(0);
  EVT NarrowVT = Narrow->getValueType(0);
  if (!NarrowVT.is128BitVector())
    return SDValue();

  if (Narrow->getOpcode() != ISD::XOR &&
      Narrow->getOpcode() != ISD::AND &&
      Narrow->getOpcode() != ISD::OR)
    return SDValue();

  SDValue N0 = Narrow->getOperand(0);
  SDValue N1 = Narrow->getOperand(1);
  SDLoc DL(Narrow);

  // The left side has to be a trunc.
  if (N0.getOpcode() != ISD::TRUNCATE)
    return SDValue();

  // The type of the truncated inputs.
  EVT WideVT = N0->getOperand(0)->getValueType(0);
  if (WideVT != VT)
    return SDValue();

  // The right side has to be a 'trunc' or a constant vector.
  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
  ConstantSDNode *RHSConstSplat = nullptr;
  if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
    RHSConstSplat = RHSBV->getConstantSplatNode();
  if (!RHSTrunc && !RHSConstSplat)
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
    return SDValue();

  // Set N0 and N1 to hold the inputs to the new wide operation.
  N0 = N0->getOperand(0);
  if (RHSConstSplat) {
    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
                     SDValue(RHSConstSplat, 0));
    N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
  } else if (RHSTrunc) {
    N1 = N1->getOperand(0);
  }

  // Generate the wide operation.
  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  case ISD::ANY_EXTEND:
    return Op;
  case ISD::ZERO_EXTEND: {
    unsigned InBits = NarrowVT.getScalarSizeInBits();
    APInt Mask = APInt::getAllOnesValue(InBits);
    Mask = Mask.zext(VT.getScalarSizeInBits());
    return DAG.getNode(ISD::AND, DL, VT,
                       Op, DAG.getConstant(Mask, DL, VT));
  }
  case ISD::SIGN_EXTEND:
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
                       Op, DAG.getValueType(NarrowVT));
  default:
    llvm_unreachable("Unexpected opcode");
  }
}

/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  unsigned FPOpcode = ISD::DELETED_NODE;
  if (N->getOpcode() == ISD::AND)
    FPOpcode = X86ISD::FAND;
  else if (N->getOpcode() == ISD::OR)
    FPOpcode = X86ISD::FOR;
  else if (N->getOpcode() == ISD::XOR)
    FPOpcode = X86ISD::FXOR;

  assert(FPOpcode != ISD::DELETED_NODE &&
         "Unexpected input node for FP logic conversion");

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);
  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
      ((Subtarget.hasSSE1() && VT == MVT::i32) ||
       (Subtarget.hasSSE2() && VT == MVT::i64))) {
    SDValue N00 = N0.getOperand(0);
    SDValue N10 = N1.getOperand(0);
    EVT N00Type = N00.getValueType();
    EVT N10Type = N10.getValueType();
    if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
      SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
      return DAG.getBitcast(VT, FPLogic);
    }
  }
  return SDValue();
}

/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
/// eliminate loading the vector constant mask value. This relies on the fact
/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
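///
/// For example, for v4i32: (and (pcmpgt X, Y), (splat 1)) becomes
/// (vsrli (pcmpgt X, Y), 31); shifting an all-ones lane right by 31 leaves
/// exactly the low 1 that the AND mask would have kept.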
static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));

  // TODO: Use AssertSext to mark any nodes that have the property of producing
  // all-ones or all-zeros. Then check for that node rather than particular
  // opcodes.
  if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
    return SDValue();

  // The existence of the PCMP node guarantees that we have the required SSE2 or
  // AVX2 for a shift of this vector type, but there is no vector shift by
  // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
  // masked compare nodes, so they should not make it here.
  EVT VT0 = Op0.getValueType();
  EVT VT1 = Op1.getValueType();
  unsigned EltBitWidth = VT0.getScalarSizeInBits();
  if (VT0 != VT1 || EltBitWidth == 8)
    return SDValue();

  assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);

  APInt SplatVal;
  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
    return SDValue();

  SDLoc DL(N);
  SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
  SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
  return DAG.getBitcast(N->getValueType(0), Shift);
}

static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
    return R;

  if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
    return ShiftRight;

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  // Attempt to recursively combine a bitmask AND with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    SmallVector<int, 1> NonceMask; // Just a placeholder.
    NonceMask.push_back(0);
    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }

  // Create BEXTR instructions:
  // BEXTR is ((X >> imm) & (2**size-1))
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();
  if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
    return SDValue();
  if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
    return SDValue();

  ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
  ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (MaskNode && ShiftNode) {
    uint64_t Mask = MaskNode->getZExtValue();
    uint64_t Shift = ShiftNode->getZExtValue();
    if (isMask_64(Mask)) {
      uint64_t MaskSize = countPopulation(Mask);
      if (Shift + MaskSize <= VT.getSizeInBits())
        return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
                           DAG.getConstant(Shift | (MaskSize << 8), DL,
                                           VT));
    }
  }
  return SDValue();
}

// Try to fold:
//   (or (and (m, y), (pandn m, x)))
// into:
//   (vselect m, x, y)
// As a special case, try to fold:
//   (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
//   (sub (xor X, M), M)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
                                            const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::OR);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
    return SDValue();
  assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");

  // Canonicalize pandn to RHS.
  if (N0.getOpcode() == X86ISD::ANDNP)
    std::swap(N0, N1);

  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
    return SDValue();

  SDValue Mask = N1.getOperand(0);
  SDValue X = N1.getOperand(1);
  SDValue Y;
  if (N0.getOperand(0) == Mask)
    Y = N0.getOperand(1);
  if (N0.getOperand(1) == Mask)
    Y = N0.getOperand(0);

  // Check to see if the mask appeared in both the AND and ANDNP.
  if (!Y.getNode())
    return SDValue();

  // Validate that X, Y, and Mask are bitcasts, and see through them.
  Mask = peekThroughBitcasts(Mask);
  X = peekThroughBitcasts(X);
  Y = peekThroughBitcasts(Y);

  EVT MaskVT = Mask.getValueType();

  // Validate that the Mask operand is a vector sra node.
  // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
  // there is no psrai.b
  unsigned EltBits = MaskVT.getScalarSizeInBits();
  unsigned SraAmt = ~0;
  if (Mask.getOpcode() == ISD::SRA) {
    if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
      if (auto *AmtConst = AmtBV->getConstantSplatNode())
        SraAmt = AmtConst->getZExtValue();
  } else if (Mask.getOpcode() == X86ISD::VSRAI) {
    SDValue SraC = Mask.getOperand(1);
    SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
  }
  if ((SraAmt + 1) != EltBits)
    return SDValue();

  SDLoc DL(N);

  // Try to match:
  //   (or (and (M, (sub 0, X)), (pandn M, X)))
  // which is a special case of vselect:
  //   (vselect M, (sub 0, X), X)
  // Per:
  // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
  // We know that, if fNegate is 0 or 1:
  //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
  //
  // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
  //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
  //   ( M      ? -X : X) == ((X ^  M     ) + (M & 1))
  // This lets us transform our vselect to:
  //   (add (xor X, M), (and M, 1))
  // And further to:
  //   (sub (xor X, M), M)
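  //
  // Sanity check: when M is all ones, (xor X, M) - M == ~X + 1 == -X; when
  // M is 0, it is X - 0 == X, matching (vselect M, (sub 0, X), X).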
  if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
    auto IsNegV = [](SDNode *N, SDValue V) {
      return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
             ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
    };
    SDValue V;
    if (IsNegV(Y.getNode(), X))
      V = X;
    else if (IsNegV(X.getNode(), Y))
      V = Y;

    if (V) {
      assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
      SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
      SDValue SubOp2 = Mask;

      // If the negate was on the false side of the select, then
      // the operands of the SUB need to be swapped. PR 27251.
      // This is because the pattern being matched above is
      //   (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
      // but if the pattern matched was
      //   (vselect M, X, (sub (0, X))), that is really negation of the pattern
      // above, -(vselect M, (sub 0, X), X), and therefore the replacement
      // pattern also needs to be a negation of the replacement pattern above.
      // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
      // sub accomplishes the negation of the replacement pattern.
      if (V == Y)
        std::swap(SubOp1, SubOp2);

      return DAG.getBitcast(VT,
                            DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
    }
  }

  // PBLENDVB is only available on SSE 4.1.
  if (!Subtarget.hasSSE41())
    return SDValue();

  MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;

  X = DAG.getBitcast(BlendVT, X);
  Y = DAG.getBitcast(BlendVT, Y);
  Mask = DAG.getBitcast(BlendVT, Mask);
  Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
  return DAG.getBitcast(VT, Mask);
}

// Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
//   seteq(cmp x, 0)
// into:
//   srl(ctlz x), log2(bitsize(x))
// Input pattern is checked by caller.
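//
// For i32 this works because ctlz ranges over [0, 32] and only ctlz(0) == 32
// has bit 5 set, so (ctlz x) >> 5 is exactly (x == 0).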
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
                                          SelectionDAG &DAG) {
  SDValue Cmp = Op.getOperand(1);
  EVT VT = Cmp.getOperand(0).getValueType();
  unsigned Log2b = Log2_32(VT.getSizeInBits());
  SDLoc dl(Op);
  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
  // The result of the shift is true or false, and on X86, the 32-bit
  // encoding of shr and lzcnt is more desirable.
  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
                            DAG.getConstant(Log2b, dl, VT));
  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
}

// Try to transform:
//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
// Will also attempt to match more generic cases, eg:
//   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
// Only applies if the target supports the FastLZCNT feature.
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
    return SDValue();

  auto isORCandidate = [](SDValue N) {
    return (N->getOpcode() == ISD::OR && N->hasOneUse());
  };

  // Check that the zero extend is extending to 32-bit or more. The code
  // generated by srl(ctlz) for 16-bit or less variants of the pattern would
  // require extra instructions to clear the upper bits.
  if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
      !isORCandidate(N->getOperand(0)))
    return SDValue();

  // Check that the node matches: setcc(eq, cmp 0).
  auto isSetCCCandidate = [](SDValue N) {
    return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
           X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
           N->getOperand(1).getOpcode() == X86ISD::CMP &&
           N->getOperand(1).getConstantOperandVal(1) == 0 &&
           N->getOperand(1).getValueType().bitsGE(MVT::i32);
  };

  SDNode *OR = N->getOperand(0).getNode();
  SDValue LHS = OR->getOperand(0);
  SDValue RHS = OR->getOperand(1);

  // Save nodes matching or(or, setcc(eq, cmp 0)).
  SmallVector<SDNode *, 2> ORNodes;
  while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
          (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
    ORNodes.push_back(OR);
    OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
    LHS = OR->getOperand(0);
    RHS = OR->getOperand(1);
  }

  // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
  if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
      !isORCandidate(SDValue(OR, 0)))
    return SDValue();

  // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
  // to
  //   or(srl(ctlz),srl(ctlz)).
  // The dag combiner can then fold it into:
  //   srl(or(ctlz, ctlz)).
  EVT VT = OR->getValueType(0);
  SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
  SDValue Ret, NewRHS;
  if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

  if (!Ret)
    return SDValue();

  // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
  while (ORNodes.size() > 0) {
    OR = ORNodes.pop_back_val();
    LHS = OR->getOperand(0);
    RHS = OR->getOperand(1);
    // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
    if (RHS->getOpcode() == ISD::OR)
      std::swap(LHS, RHS);
    EVT VT = OR->getValueType(0);
    SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
    if (!NewRHS)
      return SDValue();
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
  }

  if (Ret)
    Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

  return Ret;
}

static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI,
                         const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
    return R;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();

  // SHLD/SHRD instructions have lower register pressure, but on some
  // platforms they have higher latency than the equivalent
  // series of shifts/or that would otherwise be generated.
  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
  // have higher latencies and we are not optimizing for size.
  if (!OptForSize && Subtarget.isSHLDSlow())
    return SDValue();

  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  SDLoc DL(N);
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  if (ShAmt0.getOpcode() == ISD::SUB ||
      ShAmt0.getOpcode() == ISD::XOR) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
  // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
  // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
  // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
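  // For instance, rotate-like IR such as (or (shl x, c), (srl x, (sub 32, c)))
  // matches the first form with X == Y and becomes shld x, x, c.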
  unsigned Bits = VT.getSizeInBits();
  if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
      if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
  } else if (ShAmt1.getOpcode() == ISD::XOR) {
    SDValue Mask = ShAmt1.getOperand(1);
    if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
      unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
      SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
      if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
        ShAmt1Op0 = ShAmt1Op0.getOperand(0);
      if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
        if (Op1.getOpcode() == InnerShift &&
            isa<ConstantSDNode>(Op1.getOperand(1)) &&
            Op1.getConstantOperandVal(1) == 1) {
          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
                             DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
        }
        // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
        if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
            Op1.getOperand(0) == Op1.getOperand(1)) {
          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
                             DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
        }
      }
    }
  }

  return SDValue();
}

/// Generate NEG and CMOV for integer abs.
static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  // Since X86 does not have CMOV for 8-bit integer, we don't convert
  // 8-bit integer abs to NEG and CMOV.
  if (VT.isInteger() && VT.getSizeInBits() == 8)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
  // and change it to SUB and CMOV.
  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
    auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
      // Generate SUB & CMOV.
      SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
                                DAG.getConstant(0, DL, VT), N0.getOperand(0));
      SDValue Ops[] = {N0.getOperand(0), Neg,
                       DAG.getConstant(X86::COND_GE, DL, MVT::i8),
                       SDValue(Neg.getNode(), 1)};
      return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
    }
  }
  return SDValue();
}
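
// For reference, the source-level idiom combineIntegerAbs matches above is
// the classic branchless abs (sketched here for i32):
//   %s = ashr i32 %x, 31      ; 0 or -1 (the sign mask)
//   %a = add i32 %x, %s
//   %r = xor i32 %a, %s       ; |x|
// The replacement is a flag-producing SUB (0 - %x) feeding a CMOV that keeps
// whichever of %x and -%x is non-negative according to the SUB's flags.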
/// Try to turn tests against the signbit in the form of:
///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///   SETGT(X, -1)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
  // This is only worth doing if the output type is i8 or i1.
  EVT ResultType = N->getValueType(0);
  if (ResultType != MVT::i8 && ResultType != MVT::i1)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We should be performing an xor against a truncated shift.
  if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
    return SDValue();

  // Make sure we are performing an xor against one.
  if (!isOneConstant(N1))
    return SDValue();

  // SetCC on x86 zero extends so only act on this if it's a logical shift.
  SDValue Shift = N0.getOperand(0);
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
    return SDValue();

  // Make sure we are truncating from one of i16, i32 or i64.
  EVT ShiftTy = Shift.getValueType();
  if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
    return SDValue();

  // Make sure the shift amount extracts the sign bit.
  if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
      Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
    return SDValue();

  // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical looking
  // comparison, using SETGT matches up with what TranslateX86CC does.
  SDLoc DL(N);
  SDValue ShiftOp = Shift.getOperand(0);
  EVT ShiftOpTy = ShiftOp.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
                                               *DAG.getContext(), ResultType);
  SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
                              DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
  if (SetCCResultType != ResultType)
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
  return Cond;
}
30827 /// Turn vector tests of the signbit in the form of:
30828 /// xor (sra X, elt_size(X)-1), -1
30832 /// This should be called before type legalization because the pattern may not
30833 /// persist after that.
30834 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
30835 const X86Subtarget &Subtarget) {
30836 EVT VT = N->getValueType(0);
30837 if (!VT.isSimple())
30840 switch (VT.getSimpleVT().SimpleTy) {
30841 default: return SDValue();
30844 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
30845 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
30849 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
30852 // There must be a shift right algebraic before the xor, and the xor must be a
30853 // 'not' operation.
30854 SDValue Shift = N->getOperand(0);
30855 SDValue Ones = N->getOperand(1);
30856 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
30857 !ISD::isBuildVectorAllOnes(Ones.getNode()))
30860 // The shift should be smearing the sign bit across each vector element.
30861 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
30865 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
30866 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
30867 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
30870 // Create a greater-than comparison against -1. We don't use the more obvious
30871 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
30872 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the
/// efficient X86ISD::AVG instruction.
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget,
                                const SDLoc &DL) {
  if (!VT.isVector() || !VT.isSimple())
    return SDValue();
  EVT InVT = In.getValueType();
  unsigned NumElems = VT.getVectorNumElements();

  EVT ScalarVT = VT.getVectorElementType();
  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
        isPowerOf2_32(NumElems)))
    return SDValue();

  // InScalarVT is the intermediate type in AVG pattern and it should be greater
  // than the original input type (i8/i16).
  EVT InScalarVT = InVT.getVectorElementType();
  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
    return SDValue();

  if (!Subtarget.hasSSE2())
    return SDValue();
  if (Subtarget.hasBWI()) {
    if (VT.getSizeInBits() > 512)
      return SDValue();
  } else if (Subtarget.hasAVX2()) {
    if (VT.getSizeInBits() > 256)
      return SDValue();
  } else {
    if (VT.getSizeInBits() > 128)
      return SDValue();
  }

  // Detect the following pattern:
  //
  //   %1 = zext <N x i8> %a to <N x i32>
  //   %2 = zext <N x i8> %b to <N x i32>
  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
  //   %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
  //   %6 = trunc <N x i32> %5 to <N x i8>
  //
  // In AVX512, the last instruction can also be a trunc store.
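  //
  // When the match succeeds, the chain above collapses to one X86ISD::AVG
  // node, which selects to PAVGB/PAVGW (or their VEX/EVEX forms); those
  // instructions compute (a + b + 1) >> 1 per element without ever
  // materializing the widened intermediate type.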
  if (In.getOpcode() != ISD::SRL)
    return SDValue();

  // A lambda checking the given SDValue is a constant vector and each element
  // is in the range [Min, Max].
  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV || !BV->isConstant())
      return false;
    for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
      if (!C)
        return false;
      uint64_t Val = C->getZExtValue();
      if (Val < Min || Val > Max)
        return false;
    }
    return true;
  };

  // Check if each element of the vector is left-shifted by one.
  auto LHS = In.getOperand(0);
  auto RHS = In.getOperand(1);
  if (!IsConstVectorInRange(RHS, 1, 1))
    return SDValue();
  if (LHS.getOpcode() != ISD::ADD)
    return SDValue();

  // Detect a pattern of a + b + 1 where the order doesn't matter.
  SDValue Operands[3];
  Operands[0] = LHS.getOperand(0);
  Operands[1] = LHS.getOperand(1);

  // Take care of the case when one of the operands is a constant vector whose
  // element is in the range [1, 256].
  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
      Operands[0].getOperand(0).getValueType() == VT) {
    // The pattern is detected. Subtract one from the constant vector, then
    // demote it and emit X86ISD::AVG instruction.
    SDValue VecOnes = DAG.getConstant(1, DL, InVT);
    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
                       Operands[1]);
  }

  if (Operands[0].getOpcode() == ISD::ADD)
    std::swap(Operands[0], Operands[1]);
  else if (Operands[1].getOpcode() != ISD::ADD)
    return SDValue();
  Operands[2] = Operands[1].getOperand(0);
  Operands[1] = Operands[1].getOperand(1);

  // Now we have three operands of two additions. Check that one of them is a
  // constant vector with ones, and the other two are promoted from i8/i16.
  for (int i = 0; i < 3; ++i) {
    if (!IsConstVectorInRange(Operands[i], 1, 1))
      continue;
    std::swap(Operands[i], Operands[2]);

    // Check if Operands[0] and Operands[1] are results of type promotion.
    for (int j = 0; j < 2; ++j)
      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
          Operands[j].getOperand(0).getValueType() != VT)
        return SDValue();

    // The pattern is detected, emit X86ISD::AVG instruction.
    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
                       Operands[1].getOperand(0));
  }

  return SDValue();
}
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
  // into two 16-byte operations.
  ISD::LoadExtType Ext = Ld->getExtensionType();
  bool Fast;
  unsigned AddressSpace = Ld->getAddressSpace();
  unsigned Alignment = Ld->getAlignment();
  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
      Ext == ISD::NON_EXTLOAD &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
                             AddressSpace, Alignment, &Fast) && !Fast) {
    unsigned NumElems = RegVT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Ptr = Ld->getBasePtr();

    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  NumElems / 2);
    SDValue Load1 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Alignment, Ld->getMemOperand()->getFlags());

    Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
    SDValue Load2 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             Load1.getValue(1),
                             Load2.getValue(1));

    SDValue NewVec = DAG.getUNDEF(RegVT);
    NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
    NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
    return DCI.CombineTo(N, NewVec, TF, true);
  }

  return SDValue();
}
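
// Sketch of the combine above (the exact instructions depend on the
// subtarget): on a CPU where 32-byte unaligned accesses are slow, a single
// unaligned v8f32 load may instead be emitted as two 16-byte loads whose
// halves are rejoined via insert128BitVector, with a TokenFactor merging the
// two load chains.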
/// If V is a build vector of boolean constants and exactly one of those
/// constants is true, return the operand index of that true element.
/// Otherwise, return -1.
static int getOneTrueElt(SDValue V) {
  // This needs to be a build vector of booleans.
  // TODO: Checking for the i1 type matches the IR definition for the mask,
  // but the mask check could be loosened to i8 or other types. That might
  // also require checking more than 'allOnesValue'; eg, the x86 HW
  // instructions only require that the MSB is set for each mask element.
  // The ISD::MSTORE comments/definition do not specify how the mask operand
  // is formatted.
  auto *BV = dyn_cast<BuildVectorSDNode>(V);
  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
    return -1;

  int TrueIndex = -1;
  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    const SDValue &Op = BV->getOperand(i);
    if (Op.isUndef())
      continue;
    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
    if (!ConstNode)
      return -1;
    if (ConstNode->getAPIntValue().isAllOnesValue()) {
      // If we already found a one, this is too many.
      if (TrueIndex >= 0)
        return -1;
      TrueIndex = i;
    }
  }
  return TrueIndex;
}
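
// Example: for the i1 build vector <0, -1, 0, 0> (with true represented as
// all-ones), getOneTrueElt returns 1; a second true element, or any
// non-constant element, would make it return -1 instead.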
/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
                                         SelectionDAG &DAG, SDValue &Addr,
                                         SDValue &Index, unsigned &Alignment) {
  int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
  if (TrueMaskElt < 0)
    return false;

  // Get the address of the one scalar element that is specified by the mask
  // using the appropriate offset from the base pointer.
  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
  Addr = MaskedOp->getBasePtr();
  if (TrueMaskElt != 0) {
    unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
    Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
  }

  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
  return true;
}
/// If exactly one element of the mask is set for a non-extending masked load,
/// it is a scalar load and vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Load the one scalar element that is specified by the mask using the
  // appropriate offset from the base pointer.
  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
  EVT EltVT = VT.getVectorElementType();
  SDValue Load =
      DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
                  Alignment, ML->getMemOperand()->getFlags());

  // Insert the loaded element into the appropriate place in the vector.
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
                               Load, VecIndex);
  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
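
// For instance, a masked load whose mask constant-folds to <0, 0, -1, 0>
// becomes an ordinary scalar load of element 2 at base + 2 * EltSize,
// followed by an INSERT_VECTOR_ELT of the loaded value into the pass-through
// vector.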
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
    return SDValue();

  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);

  // If we are loading the first and last elements of a vector, it is safe and
  // always faster to load the whole vector. Replace the masked load with a
  // vector load and select.
  unsigned NumElts = VT.getVectorNumElements();
  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
  if (LoadFirstElt && LoadLastElt) {
    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                ML->getMemOperand());
    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
  }

  // Convert a masked load with a constant mask into a masked load and a select.
  // This allows the select operation to use a faster kind of select instruction
  // (for example, vblendvps -> vblendps).
  //
  // Don't try this if the pass-through operand is already undefined. That would
  // cause an infinite loop because that's what we're about to create.
  if (ML->getSrc0().isUndef())
    return SDValue();

  // The new masked load has an undef pass-through operand. The select uses the
  // original pass-through operand.
  SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                    ML->getMask(), DAG.getUNDEF(VT),
                                    ML->getMemoryVT(), ML->getMemOperand(),
                                    ML->getExtensionType());
  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());

  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}
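
// E.g. given a v4f32 masked load with constant mask <-1, 0, 0, -1>, both the
// first and last elements are live, so the transform above loads the whole
// vector and blends it with the pass-through value; with a constant mask the
// select can typically use an immediate blend (e.g. vblendps) instead of the
// slower vblendvps form.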
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

  // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())
    return SDValue();

  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
      return ScalarLoad;
    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512())
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
        return Blend;
  }

  if (Mld->getExtensionType() != ISD::SEXTLOAD)
    return SDValue();

  // Resolve extending loads.
  EVT VT = Mld->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  EVT LdVT = Mld->getMemoryVT();
  SDLoc dl(Mld);

  assert(LdVT != VT && "Cannot extend to the same type");
  unsigned ToSz = VT.getScalarSizeInBits();
  unsigned FromSz = LdVT.getScalarSizeInBits();
  // From/To sizes and ElemCount must be pow of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for extending masked load");

  unsigned SizeRatio = ToSz / FromSz;
  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   LdVT.getScalarType(), NumElems * SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Convert Src0 value.
  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
  if (!Mld->getSrc0().isUndef()) {
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
           "WideVecVT should be legal");
    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
                                    DAG.getUNDEF(WideVecVT), ShuffleVec);
  }
  // Prepare the new mask.
  SDValue NewMask;
  SDValue Mask = Mld->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
      ShuffleVec[i] = NumElems * SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems * SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SmallVector<SDValue, 16> Ops(NumConcat);
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    Ops[0] = Mask;
    for (unsigned i = 1; i != NumConcat; ++i)
      Ops[i] = ZeroVal;

    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
                                     Mld->getBasePtr(), NewMask, WideSrc0,
                                     Mld->getMemoryVT(), Mld->getMemOperand(),
                                     ISD::NON_EXTLOAD);
  SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
/// If exactly one element of the mask is set for a non-truncating masked store,
/// it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
                                              SelectionDAG &DAG) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Extract the one scalar element that is actually being stored.
  SDLoc DL(MS);
  EVT VT = MS->getValue().getValueType();
  EVT EltVT = VT.getVectorElementType();
  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                MS->getValue(), VecIndex);

  // Store that element at the appropriate offset from the base pointer.
  return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
                      Alignment, MS->getMemOperand()->getFlags());
}
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

  if (Mst->isCompressingStore())
    return SDValue();

  if (!Mst->isTruncatingStore())
    return reduceMaskedStoreToScalarStore(Mst, DAG);

  // Resolve truncating stores.
  EVT VT = Mst->getValue().getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  EVT StVT = Mst->getMemoryVT();
  SDLoc dl(Mst);

  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromSz = VT.getScalarSizeInBits();
  unsigned ToSz = StVT.getScalarSizeInBits();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // The truncating store is legal in some cases. For example
  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
  // are designated for truncate store.
  // In this case we don't need any further transformations.
  if (TLI.isTruncStoreLegal(VT, StVT))
    return SDValue();

  // From/To sizes and ElemCount must be pow of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for truncating masked store");
  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  assert(((NumElems * FromSz) % ToSz) == 0 &&
         "Unexpected ratio for truncating masked store");

  unsigned SizeRatio = FromSz / ToSz;
  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   StVT.getScalarType(), NumElems * SizeRatio);

  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;

  // Can't shuffle using an illegal type.
  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
         "WideVecVT should be legal");

  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                              DAG.getUNDEF(WideVecVT),
                                              ShuffleVec);

  SDValue NewMask;
  SDValue Mask = Mst->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
      ShuffleVec[i] = NumElems * SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems * SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SmallVector<SDValue, 16> Ops(NumConcat);
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    Ops[0] = Mask;
    for (unsigned i = 1; i != NumConcat; ++i)
      Ops[i] = ZeroVal;

    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
                            Mst->getBasePtr(), NewMask, StVT,
                            Mst->getMemOperand(), false);
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  SDValue StoredVal = St->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we are saving a concatenation of two XMM registers and 32-byte stores
  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  bool Fast;
  unsigned AddressSpace = St->getAddressSpace();
  unsigned Alignment = St->getAlignment();
  if (VT.is256BitVector() && StVT == VT &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             AddressSpace, Alignment, &Fast) &&
      !Fast) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);

    SDValue Ptr0 = St->getBasePtr();
    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);

    SDValue Ch0 =
        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
                     Alignment, St->getMemOperand()->getFlags());
    SDValue Ch1 =
        DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
                     std::min(16U, Alignment), St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
  }
  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the trunc store by a normal store with the result of X86ISD::AVG
    // instruction.
    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
                                       Subtarget, dl))
      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());

    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromSz = VT.getScalarSizeInBits();
    unsigned ToSz = StVT.getScalarSizeInBits();

    // The truncating store is legal in some cases. For example
    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
    // are designated for truncate store.
    // In this case we don't need any further transformations.
    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
      return SDValue();

    // From, To sizes and ElemCount must be pow of two.
    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromSz) % ToSz) return SDValue();

    unsigned SizeRatio = FromSz / ToSz;

    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                     StVT.getScalarType(), NumElems * SizeRatio);

    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                         DAG.getUNDEF(WideVecVT),
                                         ShuffleVec);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.
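    // For example, truncating v8i16 to v8i8 leaves the eight result bytes in
    // the low 64 bits of the shuffled register, so (when the types are legal)
    // the loop below emits one 64-bit store rather than eight byte-sized
    // stores.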
    // Find the largest store unit.
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
        StoreType = Tp;
    }

    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
        (64 <= NumElems * ToSz))
      StoreType = MVT::f64;

    // Bitcast the original vector into a vector of store-size units.
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), StoreType,
                                      VT.getSizeInBits() / StoreType.getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Ptr = St->getBasePtr();

    // Perform one or more big stores into memory.
    for (unsigned i = 0, e = (ToSz * NumElems) / StoreType.getSizeInBits(); i != e; ++i) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(i, dl));
      SDValue Ch =
          DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
                       St->getAlignment(), St->getMemOperand()->getFlags());
      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
      Chains.push_back(Ch);
    }

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  }
  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS. This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode *LdVal = St->getValue().getNode();
    LoadSDNode *Ld = nullptr;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode *ChainVal = St->getChain().getNode();
    // Must be a store of a load. We currently handle two cases: the load
    // is a direct child, and it's under an intervening TokenFactor. It is
    // possible to dig deeper under nested TokenFactors.
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget.is64Bit() || F64IsLegal) {
      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                  Ld->getPointerInfo(), Ld->getAlignment(),
                                  Ld->getMemOperand()->getFlags());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex >= 0) {
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getPointerInfo(), Ld->getAlignment(),
                               Ld->getMemOperand()->getFlags());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getPointerInfo().getWithOffset(4),
                               MinAlign(Ld->getAlignment(), 4),
                               Ld->getMemOperand()->getFlags());

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex >= 0) {
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

    SDValue LoSt =
        DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    SDValue HiSt = DAG.getStore(
        NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
        MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }
  // This is similar to the above case, but here we handle a scalar 64-bit
  // integer store that is extracted from a vector on a 32-bit target.
  // If we have SSE2, then we can treat it like a floating-point double
  // to get past legalization. The execution dependencies fixup pass will
  // choose the optimal machine instruction for the store if this really is
  // an integer or v2f32 rather than an f64.
  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue OldExtract = St->getOperand(1);
    SDValue ExtOp0 = OldExtract.getOperand(0);
    unsigned VecSize = ExtOp0.getValueSizeInBits();
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                     BitCast, OldExtract.getOperand(1));
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  return SDValue();
}
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
///   A = < float a0, float a1, float a2, float a3 >
/// and
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
  // Look for the following pattern: if
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  // At least one of the operands should be a vector shuffle.
  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  MVT VT = LHS.getSimpleValueType();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");

  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
  // operate independently on 128-bit lanes.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits() / 128;
  unsigned NumLaneElts = NumElts / NumLanes;
  assert((NumLaneElts % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  unsigned HalfLaneElts = NumLaneElts / 2;

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle then pretend it is the shuffle
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask(NumElts);
  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!LHS.getOperand(0).isUndef())
      A = LHS.getOperand(0);
    if (!LHS.getOperand(1).isUndef())
      B = LHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), LMask.begin());
  } else {
    if (!LHS.isUndef())
      A = LHS;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask[i] = i;
  }

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask(NumElts);
  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!RHS.getOperand(0).isUndef())
      C = RHS.getOperand(0);
    if (!RHS.getOperand(1).isUndef())
      D = RHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), RMask.begin());
  } else {
    if (!RHS.isUndef())
      C = RHS;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask[i] = i;
  }

  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D) && !(A == D && B == C))
    return false;

  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
  if (!A.getNode() && !B.getNode())
    return false;

  // If A and B occur in reverse order in RHS, then "swap" them (which means
  // rewriting the mask).
  if (!(A == C && B == D))
    ShuffleVectorSDNode::commuteMask(RMask);

  // At this point LHS and RHS are equivalent to
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  //   RHS = VECTOR_SHUFFLE A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      int LIdx = LMask[i + l], RIdx = RMask[i + l];

      // Ignore any UNDEF components.
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // Check that successive elements are being operated on. If not, this is
      // not a horizontal operation.
      unsigned Src = (i / HalfLaneElts); // each lane is split between srcs
      int Index = 2 * (i % HalfLaneElts) + NumElts * Src + l;
      if (!(LIdx == Index && RIdx == Index + 1) &&
          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
        return false;
    }
  }

  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
  return true;
}
/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  bool IsFadd = N->getOpcode() == ISD::FADD;
  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
      isHorizontalBinOp(LHS, RHS, IsFadd)) {
    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
  }
  return SDValue();
}
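
// Example of the fold above: with SSE3, the IR sequence
//   %l = shufflevector <4 x float> %a, <4 x float> %b, <0, 2, 4, 6>
//   %r = shufflevector <4 x float> %a, <4 x float> %b, <1, 3, 5, 7>
//   %s = fadd <4 x float> %l, %r
// is recognized by isHorizontalBinOp and emitted as a single HADDPS,
// computing <a0+a1, a2+a3, b0+b1, b2+b3> directly.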
/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
                             Regs[0].getValueType() == MVT::v2i64));
  EVT OutVT = N->getValueType(0);
  EVT OutSVT = OutVT.getVectorElementType();
  EVT InVT = Regs[0].getValueType();
  EVT InSVT = InVT.getVectorElementType();
  SDLoc DL(N);

  // First, use mask to unset all bits that won't appear in the result.
  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
         "OutSVT can only be either i8 or i16.");
  APInt Mask =
      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
  for (auto &Reg : Regs)
    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);

  MVT UnpackedVT, PackedVT;
  if (OutSVT == MVT::i8) {
    UnpackedVT = MVT::v8i16;
    PackedVT = MVT::v16i8;
  } else {
    UnpackedVT = MVT::v4i32;
    PackedVT = MVT::v8i16;
  }

  // In each iteration, truncate the type by a half size.
  auto RegNum = Regs.size();
  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
       j < e; j *= 2, RegNum /= 2) {
    for (unsigned i = 0; i < RegNum; i++)
      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
    for (unsigned i = 0; i < RegNum / 2; i++)
      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
                            Regs[i * 2 + 1]);
  }

  // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
  // then extract a subvector as the result since v8i8 is not a legal type.
  if (OutVT == MVT::v8i8) {
    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
                          DAG.getIntPtrConstant(0, DL));
    return Regs[0];
  } else if (RegNum > 1) {
    Regs.resize(RegNum);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue
combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
  EVT OutVT = N->getValueType(0);
  SDLoc DL(N);

  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
  for (auto &Reg : Regs) {
    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
  }

  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
                          Regs[i * 2 + 1]);

  if (Regs.size() > 2) {
    Regs.resize(Regs.size() / 2);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}
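
// The shift pair above is, in effect, a sign-extension of each low 16-bit
// value in place: e.g. a v4i32 lane holding 0x00012345 becomes 0x23450000
// after the left shift and 0x00002345 after the arithmetic right shift, so
// every lane fits in i16 and the PACKSS saturation never alters a value.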
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  EVT OutVT = N->getValueType(0);
  if (!OutVT.isVector())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  EVT InVT = In.getValueType();
  unsigned NumElems = OutVT.getVectorNumElements();

  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
  // SSE2, and we need to take care of it specially.
  // AVX512 provides vpmovdb.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
    return SDValue();

  EVT OutSVT = OutVT.getVectorElementType();
  EVT InSVT = InVT.getVectorElementType();
  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
        NumElems >= 8))
    return SDValue();

  // SSSE3's pshufb results in fewer instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8 &&
      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
    return SDValue();

  SDLoc DL(N);

  // Split a long vector into vectors of legal type.
  unsigned RegNum = InVT.getSizeInBits() / 128;
  SmallVector<SDValue, 8> SubVec(RegNum);
  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);

  for (unsigned i = 0; i < RegNum; i++)
    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
                            DAG.getIntPtrConstant(i * NumSubRegElts, DL));

  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
  // truncate 2 x v4i32 to v8i16.
  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
  else if (InSVT == MVT::i32)
    return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
  else
    return SDValue();
}
/// This function transforms vector truncation of 'all or none' bits values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  MVT VT = N->getValueType(0).getSimpleVT();
  MVT SVT = VT.getScalarType();

  MVT InVT = In.getValueType().getSimpleVT();
  MVT InSVT = InVT.getScalarType();

  // Use PACKSS if the input is a splatted sign bit.
  // e.g. Comparison result, sext_in_reg, etc.
  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
  if (NumSignBits != InSVT.getSizeInBits())
    return SDValue();

  // Check we have a truncation suited for PACKSS.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
    return SDValue();
  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
    return SDValue();

  return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
}
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  // Try to detect AVG pattern first.
  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
    return Avg;

  // The bitcast source is a direct mmx result.
  // Detect bitcasts from x86mmx to i32.
  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
  }

  // Try to truncate extended sign bits with PACKSS.
  if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
    return V;

  return combineVectorTruncation(N, DAG, Subtarget);
}
/// Returns the negated value if the node \p N flips sign of FP value.
///
/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go though all bitcasts.
static SDValue isFNEG(SDNode *N) {
  if (N->getOpcode() == ISD::FNEG)
    return N->getOperand(0);

  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
  if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
    return SDValue();

  SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
  if (!Op1.getValueType().isFloatingPoint())
    return SDValue();

  SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));

  unsigned EltBits = Op1.getScalarValueSizeInBits();
  auto isSignBitValue = [&](const ConstantFP *C) {
    return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
  };

  // There is more than one way to represent the same constant on
  // the different X86 targets. The type of the node may also depend on size.
  //  - load scalar value and broadcast
  //  - BUILD_VECTOR node
  //  - load from a constant pool.
  // We check all variants here.
  if (Op1.getOpcode() == X86ISD::VBROADCAST) {
    if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
      if (isSignBitValue(cast<ConstantFP>(C)))
        return Op0;

  } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
    if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
      if (isSignBitValue(CN->getConstantFPValue()))
        return Op0;

  } else if (auto *C = getTargetConstantFromNode(Op1)) {
    if (C->getType()->isVectorTy()) {
      if (auto *SplatV = C->getSplatValue())
        if (isSignBitValue(cast<ConstantFP>(SplatV)))
          return Op0;
    } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
      if (isSignBitValue(FPConst))
        return Op0;
  }
  return SDValue();
}
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  EVT OrigVT = N->getValueType(0);
  SDValue Arg = isFNEG(N);
  assert(Arg.getNode() && "N is expected to be an FNEG node");

  EVT VT = Arg.getValueType();
  EVT SVT = VT.getScalarType();
  SDLoc DL(N);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // If we're negating a FMUL node on a target with FMA, then we can avoid the
  // use of a constant by performing (-0 - A*B) instead.
  // FIXME: Check rounding control flags as well once it becomes available.
  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
      Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
                                  Arg.getOperand(1), Zero);
    return DAG.getBitcast(OrigVT, NewNode);
  }

  // If we're negating an FMA node, then we can adjust the
  // instruction to include the extra negation.
  unsigned NewOpcode = 0;
  if (Arg.hasOneUse()) {
    switch (Arg.getOpcode()) {
    case X86ISD::FMADD:      NewOpcode = X86ISD::FNMSUB;     break;
    case X86ISD::FMSUB:      NewOpcode = X86ISD::FNMADD;     break;
    case X86ISD::FNMADD:     NewOpcode = X86ISD::FMSUB;      break;
    case X86ISD::FNMSUB:     NewOpcode = X86ISD::FMADD;      break;
    case X86ISD::FMADD_RND:  NewOpcode = X86ISD::FNMSUB_RND; break;
    case X86ISD::FMSUB_RND:  NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND;  break;
    case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND;  break;
    // We can't handle scalar intrinsic node here because it would only
    // invert one element and not the whole vector. But we could try to handle
    // a negation of the lower element only.
    }
  }
  if (NewOpcode)
    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
                                              Arg.getNode()->ops()));

  return SDValue();
}
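
// E.g. an FNEG wrapped around X86ISD::FMADD is rewritten by the switch above
// into a single X86ISD::FNMSUB, using -(a*b + c) == (-(a*b)) - c, so no
// separate sign-flip constant is needed.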
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  // If we have integer vector types available, use the integer opcodes.
  if (VT.isVector() && Subtarget.hasSSE2()) {
    SDLoc dl(N);

    MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);

    SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
    SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
    unsigned IntOpcode;
    switch (N->getOpcode()) {
    default: llvm_unreachable("Unexpected FP logic op");
    case X86ISD::FOR:   IntOpcode = ISD::OR;       break;
    case X86ISD::FXOR:  IntOpcode = ISD::XOR;      break;
    case X86ISD::FAND:  IntOpcode = ISD::AND;      break;
    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
    }
    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
    return DAG.getBitcast(VT, IntOp);
  }
  return SDValue();
}
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
    return Cmp;

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
    return RV;

  if (Subtarget.hasCMov())
    if (SDValue RV = combineIntegerAbs(N, DAG))
      return RV;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (isFNEG(N))
    return combineFneg(N, DAG, Subtarget);

  return SDValue();
}
static bool isNullFPScalarOrVectorConst(SDValue V) {
  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
}

/// If a value is a scalar FP zero or a vector FP zero (potentially including
/// undefined elements), return a zero constant that may be used to fold away
/// that value. In the case of a vector, the returned constant will not contain
/// undefined elements even if the input parameter does. This makes it suitable
/// to be used as a replacement operand with operations (eg, bitwise-and) where
/// an undef should not propagate.
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  if (!isNullFPScalarOrVectorConst(V))
    return SDValue();

  if (V.getValueType().isVector())
    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));

  return V;
}
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
        (VT == MVT::f64 && Subtarget.hasSSE2())))
    return SDValue();

  auto isAllOnesConstantFP = [](SDValue V) {
    auto *C = dyn_cast<ConstantFPSDNode>(V);
    return C && C->getConstantFPValue()->isAllOnesValue();
  };

  // fand (fxor X, -1), Y --> fandn X, Y
  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

  // fand X, (fxor Y, -1) --> fandn Y, X
  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}
/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  // FAND(0.0, x) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
    return V;

  // FAND(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  // FANDN(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // FANDN(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

  // F[X]OR(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // F[X]OR(x, 0.0) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
    return N->getOperand(0);

  if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
    return NewVal;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

  // Only perform optimizations if UnsafeMath is used.
  if (!DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMINC and FMAXC, which are commutative operations.
  unsigned NewOp = 0;
  switch (N->getOpcode()) {
  default: llvm_unreachable("unknown opcode");
  case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
  case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
  }

  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), N->getOperand(1));
}
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Subtarget.useSoftFloat())
    return SDValue();

  // TODO: Check for global or instruction-level "nnan". In that case, we
  //       should be able to lower to FMAX/FMIN alone.
  // TODO: If an operand is already known to be a NaN or not a NaN, this
  //       should be an optional swap and FMAX/FMIN.

  EVT VT = N->getValueType(0);
  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
    return SDValue();

  // This takes at least 3 instructions, so favor a library call when operating
  // on a scalar and minimizing code size.
  if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
    return SDValue();

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc DL(N);
  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);

  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                    Op1
  //                Num     NaN
  //            ----------------
  //       Num  |  Max  |  Op0 |
  //  Op0  ----------------
  //       NaN  |  Op1  |  NaN |
  //            ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but
  // rather to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  //
  // So they always return Op0 if either input is a NaN. However, we can still
  // use those instructions for fmaxnum by selecting away a NaN input.

  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
  // are NaN, the NaN value of Op1 is the result.
  auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
  return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
}
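
/// X86ISD::BT only reads the low log2(bit-width) bits of its bit-index
/// operand, so try to shrink the index computation via SimplifyDemandedBits.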
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
  // since there is no sign-extended shift right operation on a vector with
  // 64-bit elements:
  //  (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //  (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2: it may be replaced with an
    // X86ISD::VSEXT node.
    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
      if (!ISD::isNormalLoad(N00.getNode()))
        return SDValue();

    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp =
          DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
  }
  return SDValue();
}

/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no-overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
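///
/// For example, assuming an i32 -> i64 extension (a sketch):
///   (i64 sext (i32 add nsw (x, 42))) --> (i64 add (i64 sext x), 42)
///   (i64 zext (i32 add nuw (x, 42))) --> (i64 add (i64 zext x), 42)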
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
      Ext->getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // TODO: This should be valid for other integer types.
  EVT VT = Ext->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue Add = Ext->getOperand(0);
  if (Add.getOpcode() != ISD::ADD)
    return SDValue();

  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
  bool NSW = Add->getFlags()->hasNoSignedWrap();
  bool NUW = Add->getFlags()->hasNoUnsignedWrap();

  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
  // into the 'zext'.
  if ((Sext && !NSW) || (!Sext && !NUW))
    return SDValue();

  // Having a constant operand to the 'add' ensures that we are not increasing
  // the instruction count because the constant is extended for free below.
  // A constant operand can also become the displacement field of an LEA.
  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
  if (!AddOp1)
    return SDValue();

  // Don't make the 'add' bigger if there's no hope of combining it with some
  // other 'add' or 'shl' instruction.
  // TODO: It may be profitable to generate simpler LEA instructions in place
  // of single 'add' instructions, but the cost model for selecting an LEA
  // currently has a high threshold.
  bool HasLEAPotential = false;
  for (auto *User : Ext->uses()) {
    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
      HasLEAPotential = true;
      break;
    }
  }
  if (!HasLEAPotential)
    return SDValue();

  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
  SDValue AddOp0 = Add.getOperand(0);
  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

  // The wider add is guaranteed to not wrap because both operands are
  // extended with matching no-wrap semantics.
  SDNodeFlags Flags;
  Flags.setNoSignedWrap(NSW);
  Flags.setNoUnsignedWrap(NUW);
  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags);
}
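
/// An 8-bit x86 divide leaves the quotient in AL and the remainder in AH,
/// which is awkward to access; the combine below folds the extension of the
/// remainder into the divrem node itself: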
/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
/// extends from AH (which we otherwise need to do contortions to access).
static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  auto OpcodeN = N->getOpcode();
  auto OpcodeN0 = N0.getOpcode();
  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
    return SDValue();

  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
    return SDValue();

  SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
                                               : X86ISD::UDIVREM8_ZEXT_HREG;
  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
                          N0.getOperand(1));
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
  return R.getValue(1);
}

/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating with
/// UNDEFs) the input into vectors of the same size as the target type, which
/// then extend the lowest elements.
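///
/// For example (a sketch, assuming an SSE4.1-only target): a
/// (v4i64 zero_extend (v4i32 x)) is split into two v2i32 halves, each widened
/// to 128 bits and extended with ZERO_EXTEND_VECTOR_INREG, then reassembled
/// with CONCAT_VECTORS.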
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InVT = N0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // Input type must be a vector and we must be extending legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();

  // On AVX2+ targets, if the input/output types are both legal then we will be
  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
    return SDValue();

  SDLoc DL(N);

  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
    EVT InVT = N.getValueType();
    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
                                 Size / InVT.getScalarSizeInBits());
    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
                                  DAG.getUNDEF(InVT));
    Opnds[0] = N;
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
  };

  // If target-size is less than 128-bits, extend to a type that would extend
  // to 128 bits, extend that and extract the original target vector.
  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
    unsigned Scale = 128 / VT.getSizeInBits();
    EVT ExVT =
        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
                       DAG.getIntPtrConstant(0, DL));
  }

  // If target-size is 128-bits (or 256-bits on AVX2 targets), then convert to
  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
  // Also use this if we don't have SSE41 to allow the legalizer to do its job.
  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
      (VT.is256BitVector() && Subtarget.hasInt256()) ||
      (VT.is512BitVector() && Subtarget.hasAVX512())) {
    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
    return Opcode == ISD::SIGN_EXTEND
               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
  }

  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

    SmallVector<SDValue, 8> Opnds;
    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                   DAG.getIntPtrConstant(Offset, DL));
      SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
      SrcVec = Opcode == ISD::SIGN_EXTEND
                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
      Opnds.push_back(SrcVec);
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
  };

  // On pre-AVX2 targets, split into 128-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
    return SplitAndExtendInReg(128);

  // On pre-AVX512 targets, split into 256-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
    return SplitAndExtendInReg(256);

  return SDValue();
}

static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  SDLoc DL(N);

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (!DCI.isBeforeLegalizeOps()) {
    if (InVT == MVT::i1) {
      // sext(i1 x) --> select x, -1, 0
      SDValue Zero = DAG.getConstant(0, DL, VT);
      SDValue AllOnes =
          DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
      return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
    }
    return SDValue();
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (Subtarget.hasAVX() && VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}
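
/// Do target-specific combines on FMA nodes: fold negated operands into the
/// matching FMSUB/FNMADD/FNMSUB form so the FNEGs become free.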
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  EVT ScalarVT = VT.getScalarType();
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
    return SDValue();

  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  auto invertIfNegative = [](SDValue &V) {
    if (SDValue NegVal = isFNEG(V.getNode())) {
      V = NegVal;
      return true;
    }
    return false;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
  bool NegB = invertIfNegative(B);
  bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);

  // The multiplication is negated when exactly one of NegA/NegB is set.
  bool NegMul = (NegA != NegB);

  unsigned NewOpcode;
  if (!NegMul)
    NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
  else
    NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

  if (N->getOpcode() == X86ISD::FMADD_RND) {
    switch (NewOpcode) {
    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADD_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
    switch (NewOpcode) {
    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS1_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
    switch (NewOpcode) {
    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS3_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
    }
  } else {
    assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
           "Unexpected opcode!");
    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
  }

  return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
}

static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      if (!isOneConstant(N0.getOperand(1)))
        return SDValue();
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (N0.getOpcode() == ISD::TRUNCATE &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return R;

  return SDValue();
}

/// Optimize x == -y --> x+y == 0
///          x != -y --> x+y != 0
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
    if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
      SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
                                 LHS.getOperand(1));
      return DAG.getSetCC(DL, N->getValueType(0), addV,
                          DAG.getConstant(0, DL, addV.getValueType()), CC);
    }
  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
    if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
      SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
                                 RHS.getOperand(1));
      return DAG.getSetCC(DL, N->getValueType(0), addV,
                          DAG.getConstant(0, DL, addV.getValueType()), CC);
    }

  if (VT.getScalarType() == MVT::i1 &&
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
    bool IsSEXT0 =
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
        (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (!IsSEXT0 || !IsVZero1) {
      // Swap the operands and update the condition code.
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);

      IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
                (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
      IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
    }

    if (IsSEXT0 && IsVZero1) {
      assert(VT == LHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
      if (CC == ISD::SETGT)
        return DAG.getConstant(0, DL, VT);
      if (CC == ISD::SETLE)
        return DAG.getConstant(1, DL, VT);
      if (CC == ISD::SETEQ || CC == ISD::SETGE)
        return DAG.getNOT(DL, LHS.getOperand(0), VT);

      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
             "Unexpected condition code!");
      return LHS.getOperand(0);
    }
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
      LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  return SDValue();
}
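
/// Remove a redundant SIGN_EXTEND_INREG from the mask operand of a masked
/// gather/scatter node; the mask is truncated to v*i1 anyway.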
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);

  // Gather and Scatter instructions use k-registers for masks. The type of
  // the masks is v*i1. So the mask will be truncated anyway.
  // The SIGN_EXTEND_INREG may be dropped.
  SDValue Mask = N->getOperand(2);
  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
    NewOps[2] = Mask.getOperand(0);
    DAG.UpdateNodeOperands(N, NewOps);
  }

  return SDValue();
}

// Helper function of combineX86SetCC. It materializes "setb reg" as
// "sbb reg,reg", since it can be extended without a zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
                               SelectionDAG &DAG, MVT VT) {
  if (VT == MVT::i8)
    return DAG.getNode(ISD::AND, DL, VT,
                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
                                   DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                   EFLAGS),
                       DAG.getConstant(1, DL, VT));
  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                 EFLAGS));
}

// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  if (CC == X86::COND_A) {
    // Try to convert COND_A into COND_B in an attempt to facilitate
    // materializing "setb reg".
    //
    // Do not flip "e > c", where "c" is a constant, because the Cmp
    // instruction cannot take an immediate as its first operand.
    //
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
                                   EFLAGS.getNode()->getVTList(),
                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
      return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
    }
  }

  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
  // a zext and produces an all-ones bit which is more useful than 0/1 in some
  // cases.
  if (CC == X86::COND_B)
    return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));

  // Try to simplify the EFLAGS and condition code operands.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
    return getSETCC(CC, Flags, DL, DAG);

  return SDValue();
}

/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

  // Try to simplify the EFLAGS and condition code operands.
  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
  // RAUW them under us.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
                       N->getOperand(1), Cond, Flags);
  }

  return SDValue();
}

static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
                                                  SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //    AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)

  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getBitcast(VT, NewAnd);
    return Res;
  }

  return SDValue();
}

static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);

    if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
      return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);

    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(Op0))
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);

  return SDValue();
}

static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
    return Res;

  // Now move on to more general possibilities.
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  if (InVT.isVector() &&
      (InSVT == MVT::i8 || InSVT == MVT::i16 ||
       (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Without AVX512DQ we only support i64 to float scalar conversion. For both
  // vectors and scalars, see if we know that the upper bits are all the sign
  // bit, in which case we can truncate the input to i32 and convert from that.
  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
    unsigned BitWidth = InVT.getScalarSizeInBits();
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
    if (NumSignBits >= (BitWidth - 31)) {
      EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
      if (InVT.isVector())
        TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
                                   InVT.getVectorNumElements());
      SDLoc dl(N);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
    }
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
    EVT LdVT = Ld->getValueType(0);

    // This transformation is not supported if the result type is f16 or f128.
    if (VT == MVT::f16 || VT == MVT::f128)
      return SDValue();

    if (!Ld->isVolatile() && !VT.isVector() &&
        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
        !Subtarget.is64Bit() && LdVT == MVT::i64) {
      SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
          SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }
  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
                          X86TargetLowering::DAGCombinerInfo &DCI) {
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
  if (X86::isZeroNode(N->getOperand(0)) &&
      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // the EFLAGS result of the node is unused.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                           DAG.getConstant(X86::COND_B, DL,
                                                           MVT::i8),
                                           N->getOperand(2)),
                               DAG.getConstant(1, DL, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  return SDValue();
}

/// fold (add Y, (sete  X, 0)) -> adc  0, Y
///      (add Y, (setne X, 0)) -> sbb -1, Y
///      (sub (sete  X, 0), Y) -> sbb  0, Y
///      (sub (setne X, 0), Y) -> adc -1, Y
static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);

  // Look through ZExts.
  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
    return SDValue();

  SDValue SetCC = Ext.getOperand(0);
  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
    return SDValue();

  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = SetCC.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  SDValue CmpOp0 = Cmp.getOperand(0);
  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
                               DAG.getConstant(1, DL, CmpOp0.getValueType()));

  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
  if (CC == X86::COND_NE)
    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
                       DL, OtherVal.getValueType(), OtherVal,
                       DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
                       NewCmp);
  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
                     DL, OtherVal.getValueType(), OtherVal,
                     DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
}
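
/// Try to replace a loop-carried sum-of-absolute-differences reduction with
/// X86ISD::PSADBW. N is expected to be a reduction add whose operands are a
/// phi and a vector select fed by a zext abs-diff pattern.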
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // TODO: There's nothing special about i32, any integer type above i16 should
  // work just as well.
  if (!VT.isVector() || !VT.isSimple() ||
      !(VT.getVectorElementType() == MVT::i32))
    return SDValue();

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;

  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (VT.getSizeInBits() / 4 > RegSize)
    return SDValue();

  // We know N is a reduction add, which means one of its operands is a phi.
  // To match SAD, we need the other operand to be a vector select.
  SDValue SelectOp, Phi;
  if (Op0.getOpcode() == ISD::VSELECT) {
    SelectOp = Op0;
    Phi = Op1;
  } else if (Op1.getOpcode() == ISD::VSELECT) {
    SelectOp = Op1;
    Phi = Op0;
  } else
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the select.
  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
    return SDValue();

  // SAD pattern detected. Now build a SAD instruction and an addition for
  // reduction. Note that the number of elements of the result of SAD is less
  // than the number of elements of its input. Therefore, we could only update
  // part of the elements in the reduction vector.
  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);

  // The output of PSADBW is a vector of i64.
  // We need to turn the vector of i64 into a vector of i32.
  // If the reduction vector is at least as wide as the psadbw result, just
  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
  // anyway.
  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
  if (VT.getSizeInBits() >= ResVT.getSizeInBits())
    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
  else
    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);

  if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
    // Update part of the elements of the reduction vector. This is done by
    // first extracting a sub-vector from it, updating this sub-vector, and
    // inserting it back.
    SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
                                 DAG.getIntPtrConstant(0, DL));
    SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
                       DAG.getIntPtrConstant(0, DL));
  }
  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
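
// A sketch of the horizontal-add pattern combineAdd tries to match below
// (element indices are illustrative):
//   (v4i32 add (shuffle X, Y, <0,2,4,6>), (shuffle X, Y, <1,3,5,7>))
//     --> (v4i32 X86ISD::HADD X, Y)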
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
  if (Flags->hasVectorReduction()) {
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
      return Sad;
  }
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  return OptimizeConditionalInDecrement(N, DAG);
}

static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles. Note that unlike
  // the add case above, subtraction is not commutative, so the operand order
  // must match exactly.
  EVT VT = N->getValueType(0);
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, false))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  return OptimizeConditionalInDecrement(N, DAG);
}
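
/// Do target-specific combines on X86ISD::VSEXT/VZEXT nodes: constant-fold
/// build-vector inputs, collapse nested vzexts, and bypass pointless
/// extract/re-insert element chains.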
static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  unsigned Opcode = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  MVT SVT = VT.getVectorElementType();
  SDValue Op = N->getOperand(0);
  MVT OpVT = Op.getSimpleValueType();
  MVT OpEltVT = OpVT.getVectorElementType();
  unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();

  // Perform any constant folding.
  // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    unsigned NumDstElts = VT.getVectorNumElements();
    SmallBitVector Undefs(NumDstElts, false);
    SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
    for (unsigned i = 0; i != NumDstElts; ++i) {
      SDValue OpElt = Op.getOperand(i);
      if (OpElt.getOpcode() == ISD::UNDEF) {
        Undefs[i] = true;
        continue;
      }
      APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
      Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
                                        : Cst.sextOrTrunc(SVT.getSizeInBits());
    }
    return getConstVector(Vals, Undefs, VT, DAG, DL);
  }

  // (vzext (bitcast (vzext (x)) -> (vzext x)
  // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
  SDValue V = peekThroughBitcasts(Op);
  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
    MVT InnerVT = V.getSimpleValueType();
    MVT InnerEltVT = InnerVT.getVectorElementType();

    // If the element sizes match exactly, we can just do one larger vzext.
    // This is always an exact type match as vzext operates on integer types.
    if (OpEltVT == InnerEltVT) {
      assert(OpVT == InnerVT && "Types must match for vzext!");
      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
    }

    // The only other way we can combine them is if only a single element of
    // the inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

    // In this case, the inner vzext is completely dead because we're going to
    // only look at bits inside of the low element. Just do the outer vzext on
    // a bitcast of the input to the inner.
    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
  }

  // Check if we can bypass extracting and re-inserting an element of an input
  // vector. Essentially:
  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
  // TODO: Add X86ISD::VSEXT support
  if (Opcode == X86ISD::VZEXT &&
      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
    SDValue ExtractedV = V.getOperand(0);
    SDValue OrigV = ExtractedV.getOperand(0);
    if (isNullConstant(ExtractedV.getOperand(1))) {
      MVT OrigVT = OrigV.getSimpleValueType();
      // Extract a subvector if necessary...
      if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
        int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
        OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
                                  OrigVT.getVectorNumElements() / Ratio);
        OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                            DAG.getIntPtrConstant(0, DL));
      }
      Op = DAG.getBitcast(OpVT, OrigV);
      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    }
  }

  return SDValue();
}

/// Canonicalize (LSUB p, 1) -> (LADD p, -1).
static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  MVT VT = RHS.getSimpleValueType();
  SDLoc DL(N);

  auto *C = dyn_cast<ConstantSDNode>(RHS);
  if (!C || C->getZExtValue() != 1)
    return SDValue();

  RHS = DAG.getConstant(-1, DL, VT);
  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
  return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
                                 DAG.getVTList(MVT::i32, MVT::Other),
                                 {Chain, LHS, RHS}, VT, MMO);
}

// TEST (AND a, b), (AND a, b) -> TEST a, b
static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  return DAG.getNode(X86ISD::TESTM, DL, VT,
                     Op0->getOperand(0), Op0->getOperand(1));
}

static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // Comparing a vector with itself has a known result: PCMPEQ yields all-ones
  // and PCMPGT yields all-zeros.
  if (N->getOperand(0) == N->getOperand(1)) {
    if (N->getOpcode() == X86ISD::PCMPEQ)
      return getOnesVector(VT, Subtarget, DAG, DL);
    if (N->getOpcode() == X86ISD::PCMPGT)
      return getZeroVector(VT, Subtarget, DAG, DL);
  }

  return SDValue();
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
  case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
  case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
  case ISD::ADD: return combineAdd(N, DAG, Subtarget);
  case ISD::SUB: return combineSub(N, DAG, Subtarget);
  case X86ISD::ADC: return combineADC(N, DAG, DCI);
  case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
  case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
  case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
  case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
  case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
  case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
  case ISD::STORE: return combineStore(N, DAG, Subtarget);
  case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
  case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
  case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
  case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
  case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
  case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
  case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX: return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
  case X86ISD::BT: return combineBT(N, DAG, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
  case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
  case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
  case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget);
  case X86ISD::VSEXT:
  case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP: // Handle all target specific shuffles
  case X86ISD::INSERTPS:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::BLENDI:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPPERM:
  case X86ISD::VPERMI:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
  case X86ISD::FMADD:
  case X86ISD::FMADD_RND:
  case X86ISD::FMADDS1_RND:
  case X86ISD::FMADDS3_RND:
  case ISD::FMA: return combineFMA(N, DAG, Subtarget);
  case ISD::MGATHER:
  case ISD::MSCATTER: return combineGatherScatter(N, DAG);
  case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
  case X86ISD::TESTM: return combineTestM(N, DAG);
  case X86ISD::PCMPEQ:
  case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
  }

  return SDValue();
}

/// Return true if the target has native support for the specified value type
/// and it is 'desirable' to use the type for the given node type. e.g. On x86
/// i16 is legal, but undesirable since i16 instruction encodings are longer
/// and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
/// we don't adjust the stack we clobber the first frame index.
/// See X86InstrInfo::copyPhysReg.
bool X86TargetLowering::hasCopyImplyingStackAdjustment(
    MachineFunction *MF) const {
  const MachineRegisterInfo &MRI = MF->getRegInfo();
  return any_of(MRI.reg_instructions(X86::EFLAGS),
                [](const MachineInstr &RI) { return RI.isCopy(); });
}

/// This method queries the target whether it is beneficial for dag combiner to
/// promote the specified node. If true, it should return the desired promotion
/// type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    LLVM_FALLTHROUGH;
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Helper to match a string separated by whitespace.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

  for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

    S = S.substr(Piece.size());
    StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return true;
}
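
// Usage sketch: matchAsm("bswap $0", {"bswap", "$0"}) returns true, while
// matchAsm("bswapl $0", {"bswap", "$0"}) returns false because "bswapl"
// merely has "bswap" as a prefix.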
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  const std::string &AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}

/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R': case 'q': case 'Q': case 'f': case 't': case 'u':
    case 'y': case 'x': case 'v': case 'Y': case 'l':
      return C_RegisterClass;
    case 'k': // AVX512 masking registers.
    case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A':
      return C_Register;
    case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
    case 'G': case 'C': case 'e': case 'Z':
      return C_Other;
    default:
      break;
    }
  } else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default:
      break;
    case 'Y':
      switch (Constraint[1]) {
      default:
        break;
      case 'k':
        return C_Register;
      }
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
  case 'R': case 'q': case 'Q': case 'a': case 'b': case 'c':
  case 'd': case 'S': case 'D': case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f': case 't': case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget.hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'Y':
    // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
    if (constraint[1] == 'k') {
      // Support for 'Yk' (similarly to the 'k' variant below).
      weight = CW_SpecificReg;
      break;
    }
    // Else fall through (handle "Y" constraint).
    LLVM_FALLTHROUGH;
  case 'v':
    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
      weight = CW_Register;
    LLVM_FALLTHROUGH;
  case 'x':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
      weight = CW_Register;
    break;
  case 'k':
    // Enable conditional vector operations using %k<#> registers.
    weight = CW_SpecificReg;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}

/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
33857 /// Lower the specified operand into the Ops vector.
33858 /// If it is invalid, don't add anything to Ops.
33859 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
33860 std::string &Constraint,
33861 std::vector<SDValue>&Ops,
33862 SelectionDAG &DAG) const {
33865 // Only support length 1 constraints for now.
33866 if (Constraint.length() > 1) return;
33868 char ConstraintLetter = Constraint[0];
33869 switch (ConstraintLetter) {
33872 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
33873 if (C->getZExtValue() <= 31) {
33874 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
33875 Op.getValueType());
33881 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
33882 if (C->getZExtValue() <= 63) {
33883 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
33884 Op.getValueType());
33890 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
33891 if (isInt<8>(C->getSExtValue())) {
33892 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
33893 Op.getValueType());
33899 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
33900 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
33901 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
33902 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
33903 Op.getValueType());
33909 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
33910 if (C->getZExtValue() <= 3) {
33911 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
33912 Op.getValueType());
33918 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
33919 if (C->getZExtValue() <= 255) {
33920 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
33921 Op.getValueType());
33927 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
33928 if (C->getZExtValue() <= 127) {
33929 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
33930 Op.getValueType());
    // 32-bit signed value.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
    }
    // FIXME: gcc accepts some relocatable values here too, but only in
    // certain memory models; it's complicated.
    return;
  }
  case 'Z': {
    // 32-bit unsigned value.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME: gcc accepts some relocatable values here too, but only in
    // certain memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
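// Illustrative only: with "asm("addl %1, %0" : "+r"(a) : "e"(IMM))", an IMM
// that fits in a signed 32-bit value is accepted and pushed into Ops as a
// target constant; anything out of range leaves Ops empty, rejecting the
// operand.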
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}
/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}
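// Note: hasSuperClassEq also matches the class itself, so subclasses such as
// GR32_ABCD count as GR classes and FR32X itself counts as an FR class.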
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      // FALL THROUGH.
    case 'v':
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v16i32:
      case MVT::v8i64:
      case MVT::v16f32:
      case MVT::v8f64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default: break;
    case 'k':
      // This register class doesn't allocate k0 for masked vector operations.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }
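  // Illustrative only: an i16-typed "Yk" operand, e.g.
  //   asm("kmovw %1, %0" : "=Yk"(m) : "r"(v));
  // is allocated from VK16WM, so it can never land in k0, which is not a
  // valid write-mask register.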
  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) .. st(7) to the corresponding FP-stack register.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }
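    // Illustrative only: "asm("fsqrt" : "={st}"(y) : "0"(x))" pins its
    // operand to st(0) (X86::FP0); "{st(3)}" maps to X86::FP3 the same way.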
    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS is the dedicated register holding condition codes.
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }
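  // Illustrative only (32-bit mode): "asm volatile("rdtsc" : "=A"(t))" with a
  // 64-bit 't' receives the counter in the EDX:EAX pair modeled by
  // GR32_ADRegClass.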
  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (Res.second->hasType(VT) || VT == MVT::Other)
    return Res;   // Correct type already, nothing to do.

  // Get a matching integer of the correct size, i.e. "ax" with MVT::i32
  // should return "eax". This should even work for things like getting
  // 64-bit integer registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      Res.first = DestReg;
      Res.second = Size == 8 ? &X86::GR8RegClass
                 : Size == 16 ? &X86::GR16RegClass
                 : Size == 32 ? &X86::GR32RegClass
                 : &X86::GR64RegClass;
      assert(Res.second->contains(Res.first) && "Register in register class");
    } else {
      // No register found / type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it
    // can find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
    else if (X86::VR512RegClass.hasType(VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
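// Illustrative only: "{ax}" requested with an i32 operand comes back from the
// generic mapper as AX/GR16 and is rewritten above to EAX/GR32, while
// "{xmm0}" with an f32 operand keeps XMM0 but has its class corrected to
// FR32.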
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // requires two allocations (one for the load, one for the computation),
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // requires just one allocation, i.e., freeing allocations for other
  // operations and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because, for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
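// E.g., an AM with HasBaseReg and Scale == 2, describing (%rsi,%rdx,2), is
// legal but uses a second register, so the cost is 1; plain (%rsi) has
// Scale == 0 and costs 0.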
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
  // Integer division on x86 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
                                   Attribute::MinSize);
  return OptSize && !VT.isVector();
}
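// E.g., at minsize "udiv i32 %a, 10" stays a single divl instead of being
// expanded into the usual magic-number multiply/shift sequence, while a
// "udiv <4 x i32>" is still expanded because it would otherwise have to be
// scalarized.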
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction()->hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}
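// I.e., the swifterror attribute, which is lowered through a dedicated
// callee-saved register, is only supported when targeting x86-64.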