//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.
  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
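  // For example, an SSE integer compare such as PCMPEQD leaves each true lane
  // as all ones (-1) and each false lane as all zeros, which is why vector
  // booleans are modelled here as 0/-1 rather than 0/1.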
  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  // Bypass expensive divides on Atom when compiling with O2.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 16);
  }
  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Set up the Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }
  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // The MS runtime is weird: it exports _setjmp, but longjmp (no underscore)!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }
  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }
  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
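  //
  // For example, for "q = a / b; r = a % b;" both operations expand to
  // ISD::SDIVREM nodes with identical operands; CSE merges them, and the
  // result maps onto a single x86 IDIV, which leaves the quotient in EAX/RAX
  // and the remainder in EDX/RDX.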
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
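  //
  // As an illustration, the one-operand x86 MUL/IMUL forms compute a
  // double-width product (e.g. EDX:EAX = EAX * r/m32), so MULHS/MULHU are
  // expanded into the *MUL_LOHI nodes that correspond to those instructions.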
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() ||
      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }
  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
  if (Subtarget.hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SETCCE, VT, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not intended to support full SjLj
  // exception handling; they are a lightweight setjmp/longjmp replacement used
  // to support continuation, user-level threading, and so on. As a result, no
  // other SjLj exception interfaces are implemented; please don't build your
  // own exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }
  // 64-bit shl, sra, srl (iff 32-bit x86).
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
  // Expand certain atomics.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }
  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod.
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      if (!TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FSIN, VT, Expand);
        setOperationAction(ISD::FCOS, VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);
  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
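    // For example, a sign-extending load of a v4i8 can be done as one 32-bit
    // scalar load (MOVD) followed by an in-register sign extension of the four
    // bytes (PMOVSXBD on SSE4.1, or unpacks plus arithmetic shifts otherwise).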
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }
    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
    for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }
    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }
    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X.
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }
    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
    for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }
    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }
    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

    // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
    // when we have a 256bit-wide blend with immediate.
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
    // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X.
    setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);

    setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }
    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }
    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }
    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }
    setOperationAction(ISD::BR_CC, MVT::i1, Expand);
    setOperationAction(ISD::SETCC, MVT::i1, Custom);
    setOperationAction(ISD::SETCCE, MVT::i1, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
    setOperationAction(ISD::XOR, MVT::i1, Legal);
    setOperationAction(ISD::OR, MVT::i1, Legal);
    setOperationAction(ISD::AND, MVT::i1, Legal);
    setOperationAction(ISD::SUB, MVT::i1, Custom);
    setOperationAction(ISD::ADD, MVT::i1, Custom);
    setOperationAction(ISD::MUL, MVT::i1, Custom);
    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }
    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }
    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

      setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    } else {
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }
    setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
    setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
    if (Subtarget.hasDQI()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
      if (Subtarget.hasVLX()) {
        // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
        setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
      }
    }
    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);

      // FIXME: These commands are available on SSE/AVX2, add relevant patterns.
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }
    setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    if (Subtarget.hasDQI()) {
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
    }
1313 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1314 setOperationAction(ISD::FFLOOR, VT, Legal);
1315 setOperationAction(ISD::FCEIL, VT, Legal);
1316 setOperationAction(ISD::FTRUNC, VT, Legal);
1317 setOperationAction(ISD::FRINT, VT, Legal);
1318 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1321 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1322 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1324 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1325 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1326 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1328 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1329 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1330 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1331 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1332 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1334 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1335 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1337 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1339 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1340 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1341 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1342 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1343 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1344 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1345 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1346 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1347 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1348 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1349 setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
1350 setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
1352 setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
1353 setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
1354 setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
1355 setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
1356 setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
1357 setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
1358 setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
1359 setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
1361 setOperationAction(ISD::ADD, MVT::v8i1, Expand);
1362 setOperationAction(ISD::ADD, MVT::v16i1, Expand);
1363 setOperationAction(ISD::SUB, MVT::v8i1, Expand);
1364 setOperationAction(ISD::SUB, MVT::v16i1, Expand);
1365 setOperationAction(ISD::MUL, MVT::v8i1, Expand);
1366 setOperationAction(ISD::MUL, MVT::v16i1, Expand);
1368 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1370 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1371 setOperationAction(ISD::SRL, VT, Custom);
1372 setOperationAction(ISD::SHL, VT, Custom);
1373 setOperationAction(ISD::SRA, VT, Custom);
1374 setOperationAction(ISD::CTPOP, VT, Custom);
1375 setOperationAction(ISD::CTTZ, VT, Custom);
1378 // Need to promote to 64-bit even though we have 32-bit masked instructions
1379 // because the IR optimizers rearrange bitcasts around logic ops leaving
1380 // too many variations to handle if we don't promote them.
1381 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1382 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1383 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
1385 if (Subtarget.hasCDI()) {
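// Note: AVX-512CD provides VPLZCNTD/VPLZCNTQ, making 512-bit CTLZ directly
// legal; the CTTZ_ZERO_UNDEF cases below are custom lowered, typically by
// reusing the leading-zero count.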
1386 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1387 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1389 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1390 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1391 setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
1392 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
1394 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
1395 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
1397 if (Subtarget.hasVLX()) {
1398 setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
1399 setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
1400 setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
1401 setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
1403 setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
1404 setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
1405 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1406 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1409 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
1410 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
1411 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
1412 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
1413 } // Subtarget.hasCDI()
1415 if (Subtarget.hasDQI()) {
1416 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1417 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1418 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1419 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1422 // Custom lower several nodes.
1423 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1424 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1425 setOperationAction(ISD::MGATHER, VT, Custom);
1426 setOperationAction(ISD::MSCATTER, VT, Custom);
1428 // Extract subvector is special because the value type
1429 // (result) is 256-bit but the source is 512-bit wide.
1430 // 128-bit was made Custom under AVX1.
1431 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1432 MVT::v8f32, MVT::v4f64 })
1433 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1434 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1435 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1436 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1438 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1439 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1440 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1441 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1442 setOperationAction(ISD::VSELECT, VT, Legal);
1443 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1444 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1445 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1446 setOperationAction(ISD::MLOAD, VT, Legal);
1447 setOperationAction(ISD::MSTORE, VT, Legal);
1448 setOperationAction(ISD::MGATHER, VT, Legal);
1449 setOperationAction(ISD::MSCATTER, VT, Custom);
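// Note: 512-bit loads and vector selects of these integer types are promoted
// to v8i64 (effectively just a bitcast), presumably so that a single
// AVX-512F-legal type covers them all.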
1451 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1452 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1453 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1457 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1458 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1459 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1461 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1462 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1464 setOperationAction(ISD::ADD, MVT::v32i1, Expand);
1465 setOperationAction(ISD::ADD, MVT::v64i1, Expand);
1466 setOperationAction(ISD::SUB, MVT::v32i1, Expand);
1467 setOperationAction(ISD::SUB, MVT::v64i1, Expand);
1468 setOperationAction(ISD::MUL, MVT::v32i1, Expand);
1469 setOperationAction(ISD::MUL, MVT::v64i1, Expand);
1471 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1472 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1473 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1474 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1475 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1476 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1477 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1478 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1479 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1480 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1481 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1482 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1483 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
1484 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
1485 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1486 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1487 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1488 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1489 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1490 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1491 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1492 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1493 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1494 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1495 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1496 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1497 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1498 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1499 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1500 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1501 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1502 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1503 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1504 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1505 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1506 setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
1507 setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
1508 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1509 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1510 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1511 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1512 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1513 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1514 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1515 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1516 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1517 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1519 setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
1520 setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
1521 setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
1522 setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
1523 setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
1524 setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
1525 setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
1526 setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
1528 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1530 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1531 if (Subtarget.hasVLX()) {
1532 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1533 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1536 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
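// Note: with VLX the 128/256-bit masked load/store forms are available
// directly; without it they are custom lowered (in practice by widening the
// operation to the 512-bit form).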
1537 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1538 setOperationAction(ISD::MLOAD, VT, Action);
1539 setOperationAction(ISD::MSTORE, VT, Action);
1542 if (Subtarget.hasCDI()) {
1543 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1544 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1547 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1548 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1549 setOperationAction(ISD::VSELECT, VT, Legal);
1550 setOperationAction(ISD::SRL, VT, Custom);
1551 setOperationAction(ISD::SHL, VT, Custom);
1552 setOperationAction(ISD::SRA, VT, Custom);
1553 setOperationAction(ISD::MLOAD, VT, Legal);
1554 setOperationAction(ISD::MSTORE, VT, Legal);
1555 setOperationAction(ISD::CTPOP, VT, Custom);
1556 setOperationAction(ISD::CTTZ, VT, Custom);
1558 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1559 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1560 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1563 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1564 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1565 if (Subtarget.hasVLX()) {
1566 // FIXME: These operations are available on SSE/AVX2; add the relevant patterns.
1567 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1568 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
1573 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1574 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1575 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1577 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1578 setOperationAction(ISD::ADD, VT, Expand);
1579 setOperationAction(ISD::SUB, VT, Expand);
1580 setOperationAction(ISD::MUL, VT, Expand);
1581 setOperationAction(ISD::VSELECT, VT, Expand);
1583 setOperationAction(ISD::TRUNCATE, VT, Custom);
1584 setOperationAction(ISD::SETCC, VT, Custom);
1585 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1586 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1587 setOperationAction(ISD::SELECT, VT, Custom);
1588 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1589 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1592 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1593 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1594 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1595 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1597 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1598 setOperationAction(ISD::SMAX, VT, Legal);
1599 setOperationAction(ISD::UMAX, VT, Legal);
1600 setOperationAction(ISD::SMIN, VT, Legal);
1601 setOperationAction(ISD::UMIN, VT, Legal);
1605 // We want to custom lower some of our intrinsics.
1606 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1607 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1608 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1609 if (!Subtarget.is64Bit()) {
1610 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1611 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1614 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1615 // handle type legalization for these operations here.
1617 // FIXME: We really should do custom legalization for addition and
1618 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1619 // than generic legalization for 64-bit multiplication-with-overflow, though.
1620 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1621 if (VT == MVT::i64 && !Subtarget.is64Bit())
1622 continue;
1623 // Add/Sub/Mul with overflow operations are custom lowered.
1624 setOperationAction(ISD::SADDO, VT, Custom);
1625 setOperationAction(ISD::UADDO, VT, Custom);
1626 setOperationAction(ISD::SSUBO, VT, Custom);
1627 setOperationAction(ISD::USUBO, VT, Custom);
1628 setOperationAction(ISD::SMULO, VT, Custom);
1629 setOperationAction(ISD::UMULO, VT, Custom);
1632 if (!Subtarget.is64Bit()) {
1633 // These libcalls are not available in 32-bit.
1634 setLibcallName(RTLIB::SHL_I128, nullptr);
1635 setLibcallName(RTLIB::SRL_I128, nullptr);
1636 setLibcallName(RTLIB::SRA_I128, nullptr);
1639 // Combine sin / cos into one node or libcall if possible.
1640 if (Subtarget.hasSinCos()) {
1641 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1642 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1643 if (Subtarget.isTargetDarwin()) {
1644 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1645 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1646 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1647 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1651 if (Subtarget.isTargetWin64()) {
1652 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1653 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1654 setOperationAction(ISD::SREM, MVT::i128, Custom);
1655 setOperationAction(ISD::UREM, MVT::i128, Custom);
1656 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1657 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1660 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1661 // is. We should promote the value to 64 bits to solve this.
1662 // This is what the CRT headers do - `fmodf` is an inline header
1663 // function casting to f64 and calling `fmod`.
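// In effect, a call such as fmodf(x, y) is lowered as
// (float)fmod((double)x, (double)y), matching the CRT's inline definition.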
1664 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1665 Subtarget.isTargetWindowsItanium()))
1666 for (ISD::NodeType Op :
1667 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1668 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1669 if (isOperationExpand(Op, MVT::f32))
1670 setOperationAction(Op, MVT::f32, Promote);
1672 // We have target-specific DAG combine patterns for the following nodes:
1673 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1674 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1675 setTargetDAGCombine(ISD::BITCAST);
1676 setTargetDAGCombine(ISD::VSELECT);
1677 setTargetDAGCombine(ISD::SELECT);
1678 setTargetDAGCombine(ISD::SHL);
1679 setTargetDAGCombine(ISD::SRA);
1680 setTargetDAGCombine(ISD::SRL);
1681 setTargetDAGCombine(ISD::OR);
1682 setTargetDAGCombine(ISD::AND);
1683 setTargetDAGCombine(ISD::ADD);
1684 setTargetDAGCombine(ISD::FADD);
1685 setTargetDAGCombine(ISD::FSUB);
1686 setTargetDAGCombine(ISD::FNEG);
1687 setTargetDAGCombine(ISD::FMA);
1688 setTargetDAGCombine(ISD::FMINNUM);
1689 setTargetDAGCombine(ISD::FMAXNUM);
1690 setTargetDAGCombine(ISD::SUB);
1691 setTargetDAGCombine(ISD::LOAD);
1692 setTargetDAGCombine(ISD::MLOAD);
1693 setTargetDAGCombine(ISD::STORE);
1694 setTargetDAGCombine(ISD::MSTORE);
1695 setTargetDAGCombine(ISD::TRUNCATE);
1696 setTargetDAGCombine(ISD::ZERO_EXTEND);
1697 setTargetDAGCombine(ISD::ANY_EXTEND);
1698 setTargetDAGCombine(ISD::SIGN_EXTEND);
1699 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1700 setTargetDAGCombine(ISD::SINT_TO_FP);
1701 setTargetDAGCombine(ISD::UINT_TO_FP);
1702 setTargetDAGCombine(ISD::SETCC);
1703 setTargetDAGCombine(ISD::MUL);
1704 setTargetDAGCombine(ISD::XOR);
1705 setTargetDAGCombine(ISD::MSCATTER);
1706 setTargetDAGCombine(ISD::MGATHER);
1708 computeRegisterProperties(Subtarget.getRegisterInfo());
1710 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1711 MaxStoresPerMemsetOptSize = 8;
1712 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1713 MaxStoresPerMemcpyOptSize = 4;
1714 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1715 MaxStoresPerMemmoveOptSize = 4;
1716 setPrefLoopAlignment(4); // 2^4 bytes.
1718 // An out-of-order CPU can speculatively execute past a predictable branch,
1719 // but a conditional move could be stalled by an expensive earlier operation.
1720 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1721 EnableExtLdPromotion = true;
1722 setPrefFunctionAlignment(4); // 2^4 bytes.
1724 verifyIntrinsicTables();
1727 // This has so far only been implemented for 64-bit MachO.
1728 bool X86TargetLowering::useLoadStackGuardNode() const {
1729 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1732 TargetLoweringBase::LegalizeTypeAction
1733 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1734 if (ExperimentalVectorWideningLegalization &&
1735 VT.getVectorNumElements() != 1 &&
1736 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1737 return TypeWidenVector;
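// Mask vectors (i1 elements) and single-element vectors keep the default
// action even when the experimental widening flag is set.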
1739 return TargetLoweringBase::getPreferredVectorAction(VT);
1742 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1743 LLVMContext& Context,
1746 return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
1748 if (VT.isSimple()) {
1749 MVT VVT = VT.getSimpleVT();
1750 const unsigned NumElts = VVT.getVectorNumElements();
1751 MVT EltVT = VVT.getVectorElementType();
1752 if (VVT.is512BitVector()) {
1753 if (Subtarget.hasAVX512())
1754 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1755 EltVT == MVT::f32 || EltVT == MVT::f64)
1757 case 8: return MVT::v8i1;
1758 case 16: return MVT::v16i1;
1760 if (Subtarget.hasBWI())
1761 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1763 case 32: return MVT::v32i1;
1764 case 64: return MVT::v64i1;
1768 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1769 return MVT::getVectorVT(MVT::i1, NumElts);
1771 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1772 EVT LegalVT = getTypeToTransformTo(Context, VT);
1773 EltVT = LegalVT.getVectorElementType().getSimpleVT();
1776 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1778 case 2: return MVT::v2i1;
1779 case 4: return MVT::v4i1;
1780 case 8: return MVT::v8i1;
1784 return VT.changeVectorElementTypeToInteger();
1787 /// Helper for getByValTypeAlignment to determine
1788 /// the desired ByVal argument alignment.
1789 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1792 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1793 if (VTy->getBitWidth() == 128)
1795 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1796 unsigned EltAlign = 0;
1797 getMaxByValAlign(ATy->getElementType(), EltAlign);
1798 if (EltAlign > MaxAlign)
1799 MaxAlign = EltAlign;
1800 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1801 for (auto *EltTy : STy->elements()) {
1802 unsigned EltAlign = 0;
1803 getMaxByValAlign(EltTy, EltAlign);
1804 if (EltAlign > MaxAlign)
1805 MaxAlign = EltAlign;
1812 /// Return the desired alignment for ByVal aggregate
1813 /// function arguments in the caller parameter area. For X86, aggregates
1814 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1815 /// are at 4-byte boundaries.
1816 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1817 const DataLayout &DL) const {
1818 if (Subtarget.is64Bit()) {
1819 // Max of 8 and alignment of type.
1820 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1827 if (Subtarget.hasSSE1())
1828 getMaxByValAlign(Ty, Align);
1832 /// Returns the target specific optimal type for load
1833 /// and store operations as a result of memset, memcpy, and memmove
1834 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
1835 /// constraint. Similarly, if SrcAlign is zero there is no need to check it
1836 /// against an alignment requirement,
1837 /// probably because the source does not need to be loaded. If 'IsMemset' is
1838 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1839 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1840 /// source is constant so it does not need to be loaded.
1841 /// It returns EVT::Other if the type should be determined using generic
1842 /// target-independent logic.
1844 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1845 unsigned DstAlign, unsigned SrcAlign,
1846 bool IsMemset, bool ZeroMemset,
1848 MachineFunction &MF) const {
1849 const Function *F = MF.getFunction();
1850 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1852 (!Subtarget.isUnalignedMem16Slow() ||
1853 ((DstAlign == 0 || DstAlign >= 16) &&
1854 (SrcAlign == 0 || SrcAlign >= 16)))) {
1855 // FIXME: Check if unaligned 32-byte accesses are slow.
1856 if (Size >= 32 && Subtarget.hasAVX()) {
1857 // Although this isn't a well-supported type for AVX1, we'll let
1858 // legalization and shuffle lowering produce the optimal codegen. If we
1859 // choose an optimal type with a vector element larger than a byte,
1860 // getMemsetStores() may create an intermediate splat (using an integer
1861 // multiply) before we splat as a vector.
1864 if (Subtarget.hasSSE2())
1866 // TODO: Can SSE1 handle a byte vector?
1867 if (Subtarget.hasSSE1())
1869 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1870 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1871 // Do not use f64 to lower memcpy if source is string constant. It's
1872 // better to use i32 to avoid the loads.
1873 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1874 // The gymnastics of splatting a byte value into an XMM register and then
1875 // only using 8-byte stores (because this is a CPU with slow unaligned
1876 // 16-byte accesses) makes that a loser.
1880 // This is a compromise. If we reach here, unaligned accesses may be slow on
1881 // this target. However, creating smaller, aligned accesses could be even
1882 // slower and would certainly be a lot more code.
1883 if (Subtarget.is64Bit() && Size >= 8)
1888 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1890 return X86ScalarSSEf32;
1891 else if (VT == MVT::f64)
1892 return X86ScalarSSEf64;
1897 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1902 switch (VT.getSizeInBits()) {
1904 // 8-byte and under are always assumed to be fast.
1908 *Fast = !Subtarget.isUnalignedMem16Slow();
1911 *Fast = !Subtarget.isUnalignedMem32Slow();
1913 // TODO: What about AVX-512 (512-bit) accesses?
1916 // Misaligned accesses of any size are always allowed.
1920 /// Return the entry encoding for a jump table in the
1921 /// current function. The returned value is a member of the
1922 /// MachineJumpTableInfo::JTEntryKind enum.
1923 unsigned X86TargetLowering::getJumpTableEncoding() const {
1924 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1925 // symbol.
1926 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1927 return MachineJumpTableInfo::EK_Custom32;
1929 // Otherwise, use the normal jump table encoding heuristics.
1930 return TargetLowering::getJumpTableEncoding();
1933 bool X86TargetLowering::useSoftFloat() const {
1934 return Subtarget.useSoftFloat();
1938 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1939 const MachineBasicBlock *MBB,
1940 unsigned uid,MCContext &Ctx) const{
1941 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1942 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1943 // entries.
1944 return MCSymbolRefExpr::create(MBB->getSymbol(),
1945 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1948 /// Returns relocation base for the given PIC jumptable.
1949 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1950 SelectionDAG &DAG) const {
1951 if (!Subtarget.is64Bit())
1952 // This doesn't have SDLoc associated with it, but is not really the
1953 // same as a Register.
1954 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1955 getPointerTy(DAG.getDataLayout()));
1959 /// This returns the relocation base for the given PIC jumptable,
1960 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1961 const MCExpr *X86TargetLowering::
1962 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1963 MCContext &Ctx) const {
1964 // X86-64 uses RIP relative addressing based on the jump table label.
1965 if (Subtarget.isPICStyleRIPRel())
1966 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1968 // Otherwise, the reference is relative to the PIC base.
1969 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1972 std::pair<const TargetRegisterClass *, uint8_t>
1973 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1975 const TargetRegisterClass *RRC = nullptr;
1977 switch (VT.SimpleTy) {
1979 return TargetLowering::findRepresentativeClass(TRI, VT);
1980 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1981 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1984 RRC = &X86::VR64RegClass;
1986 case MVT::f32: case MVT::f64:
1987 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1988 case MVT::v4f32: case MVT::v2f64:
1989 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1990 case MVT::v8f32: case MVT::v4f64:
1991 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1992 case MVT::v16f32: case MVT::v8f64:
1993 RRC = &X86::VR128XRegClass;
1996 return std::make_pair(RRC, Cost);
1999 unsigned X86TargetLowering::getAddressSpace() const {
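// Address space 256 corresponds to %gs and 257 to %fs. The kernel code model
// keeps its per-CPU data (including the stack cookie) behind %gs, while
// user-space glibc places it behind %fs on x86-64.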
2000 if (Subtarget.is64Bit())
2001 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2005 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2006 // glibc has a special slot for the stack guard in tcbhead_t, use it instead
2007 // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
2008 if (!Subtarget.isTargetGlibc())
2009 return TargetLowering::getIRStackGuard(IRB);
2011 // %fs:0x28, unless we're using a Kernel code model, in which case it's
2012 // %gs:0x28; on i386 the guard slot is at %gs:0x14.
2013 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2014 unsigned AddressSpace = getAddressSpace();
2015 return ConstantExpr::getIntToPtr(
2016 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2017 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2020 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2021 // The MSVC CRT provides functionality for stack protection.
2022 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2023 // MSVC CRT has a global variable holding security cookie.
2024 M.getOrInsertGlobal("__security_cookie",
2025 Type::getInt8PtrTy(M.getContext()));
2027 // MSVC CRT has a function to validate security cookie.
2028 auto *SecurityCheckCookie = cast<Function>(
2029 M.getOrInsertFunction("__security_check_cookie",
2030 Type::getVoidTy(M.getContext()),
2031 Type::getInt8PtrTy(M.getContext()), nullptr));
2032 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2033 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2036 // glibc has a special slot for the stack guard.
2037 if (Subtarget.isTargetGlibc())
2039 TargetLowering::insertSSPDeclarations(M);
2042 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2043 // MSVC CRT has a global variable holding security cookie.
2044 if (Subtarget.getTargetTriple().isOSMSVCRT())
2045 return M.getGlobalVariable("__security_cookie");
2046 return TargetLowering::getSDagStackGuard(M);
2049 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2050 // MSVC CRT has a function to validate security cookie.
2051 if (Subtarget.getTargetTriple().isOSMSVCRT())
2052 return M.getFunction("__security_check_cookie");
2053 return TargetLowering::getSSPStackGuardCheck(M);
2056 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2057 if (Subtarget.getTargetTriple().isOSContiki())
2058 return getDefaultSafeStackPointerLocation(IRB, false);
2060 if (!Subtarget.isTargetAndroid())
2061 return TargetLowering::getSafeStackPointerLocation(IRB);
2063 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2064 // definition of TLS_SLOT_SAFESTACK in
2065 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2066 unsigned AddressSpace, Offset;
2068 // %fs:0x48, unless we're using a Kernel code model, in which case it's
2069 // %gs:0x48; on i386 the slot is at %gs:0x24.
2070 Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2071 AddressSpace = getAddressSpace();
2072 return ConstantExpr::getIntToPtr(
2073 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2074 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2077 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2078 unsigned DestAS) const {
2079 assert(SrcAS != DestAS && "Expected different address spaces!");
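// Address spaces 256-258 are reserved for the %gs/%fs/%ss segment-relative
// spaces, so only casts where both sides are ordinary (flat) address spaces
// below 256 can be treated as no-ops.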
2081 return SrcAS < 256 && DestAS < 256;
2084 //===----------------------------------------------------------------------===//
2085 // Return Value Calling Convention Implementation
2086 //===----------------------------------------------------------------------===//
2088 #include "X86GenCallingConv.inc"
2090 bool X86TargetLowering::CanLowerReturn(
2091 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2092 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2093 SmallVector<CCValAssign, 16> RVLocs;
2094 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2095 return CCInfo.CheckReturn(Outs, RetCC_X86);
2098 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2099 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2103 /// Lowers mask values (v*i1) to the local register values.
2104 /// \returns the DAG node after lowering to the register type.
2105 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2106 const SDLoc &Dl, SelectionDAG &DAG) {
2107 EVT ValVT = ValArg.getValueType();
2109 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2110 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2111 // Two-stage lowering might be required:
2112 // bitcast: v8i1 -> i8 / v16i1 -> i16
2113 // anyextend: i8 -> i32 / i16 -> i32
2114 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2115 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2116 if (ValLoc == MVT::i32)
2117 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2119 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2120 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2121 // One-stage lowering is required:
2122 // bitcast: v32i1 -> i32 / v64i1 -> i64
2123 return DAG.getBitcast(ValLoc, ValArg);
2125 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2128 /// Breaks v64i1 value into two registers and adds the new node to the DAG
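/// On 32-bit targets the value is bitcast to i64 and split into two i32
/// halves, which are passed in the consecutive registers described by VA and
/// NextVA.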
2129 static void Passv64i1ArgInRegs(
2130 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2131 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2132 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2133 assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2134 "Expected AVX512BW or AVX512BMI target!");
2135 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2136 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2137 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2138 "The value should reside in two registers");
2140 // Before splitting the value we cast it to i64
2141 Arg = DAG.getBitcast(MVT::i64, Arg);
2143 // Split the value into two i32 halves.
2145 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2146 DAG.getConstant(0, Dl, MVT::i32));
2147 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2148 DAG.getConstant(1, Dl, MVT::i32));
2150 // Attach the two i32 halves to their corresponding registers.
2151 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2152 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2156 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2158 const SmallVectorImpl<ISD::OutputArg> &Outs,
2159 const SmallVectorImpl<SDValue> &OutVals,
2160 const SDLoc &dl, SelectionDAG &DAG) const {
2161 MachineFunction &MF = DAG.getMachineFunction();
2162 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2164 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2165 report_fatal_error("X86 interrupts may not return any value");
2167 SmallVector<CCValAssign, 16> RVLocs;
2168 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2169 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2172 SmallVector<SDValue, 6> RetOps;
2173 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2174 // Operand #1 = Bytes To Pop
2175 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2178 // Copy the result values into the output registers.
2179 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2181 CCValAssign &VA = RVLocs[I];
2182 assert(VA.isRegLoc() && "Can only return in registers!");
2183 SDValue ValToCopy = OutVals[OutsIndex];
2184 EVT ValVT = ValToCopy.getValueType();
2186 // Promote values to the appropriate types.
2187 if (VA.getLocInfo() == CCValAssign::SExt)
2188 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2189 else if (VA.getLocInfo() == CCValAssign::ZExt)
2190 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2191 else if (VA.getLocInfo() == CCValAssign::AExt) {
2192 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2193 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2195 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2197 else if (VA.getLocInfo() == CCValAssign::BCvt)
2198 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2200 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2201 "Unexpected FP-extend for return value.");
2203 // If this is x86-64, and we disabled SSE, we can't return FP values,
2204 // or SSE or MMX vectors.
2205 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2206 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2207 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2208 report_fatal_error("SSE register return with SSE disabled");
2210 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2211 // llvm-gcc has never done it right and no one has noticed, so this
2212 // should be OK for now.
2213 if (ValVT == MVT::f64 &&
2214 (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2215 report_fatal_error("SSE2 register return with SSE2 disabled");
2217 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2218 // the RET instruction and handled by the FP Stackifier.
2219 if (VA.getLocReg() == X86::FP0 ||
2220 VA.getLocReg() == X86::FP1) {
2221 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2222 // change the value to the FP stack register class.
2223 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2224 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2225 RetOps.push_back(ValToCopy);
2226 // Don't emit a copytoreg.
2230 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2231 // which is returned in RAX / RDX.
2232 if (Subtarget.is64Bit()) {
2233 if (ValVT == MVT::x86mmx) {
2234 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2235 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2236 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2238 // If we don't have SSE2 available, convert to v4f32 so the generated
2239 // register is legal.
2240 if (!Subtarget.hasSSE2())
2241 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2246 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2248 if (VA.needsCustom()) {
2249 assert(VA.getValVT() == MVT::v64i1 &&
2250 "Currently the only custom case is when we split v64i1 to 2 regs");
2252 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2255 assert(2 == RegsToPass.size() &&
2256 "Expecting two registers after Pass64BitArgInRegs");
2258 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2261 // Add nodes to the DAG and add the values into the RetOps list
2262 for (auto &Reg : RegsToPass) {
2263 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2264 Flag = Chain.getValue(1);
2265 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2269 // Swift calling convention does not require we copy the sret argument
2270 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2272 // All x86 ABIs require that for returning structs by value we copy
2273 // the sret argument into %rax/%eax (depending on ABI) for the return.
2274 // We saved the argument into a virtual register in the entry block,
2275 // so now we copy the value out and into %rax/%eax.
2277 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2278 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2279 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2280 // either case FuncInfo->setSRetReturnReg() will have been called.
2281 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2282 // When we have both sret and another return value, we should use the
2283 // original Chain stored in RetOps[0], instead of the current Chain updated
2284 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2286 // For the case of sret and another return value, we have
2287 // Chain_0 at the function entry
2288 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2289 // If we use Chain_1 in getCopyFromReg, we will have
2290 // Val = getCopyFromReg(Chain_1)
2291 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2293 // getCopyToReg(Chain_0) will be glued together with
2294 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2295 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2296 // Data dependency from Unit B to Unit A due to usage of Val in
2297 // getCopyToReg(Chain_1, Val)
2298 // Chain dependency from Unit A to Unit B
2300 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2301 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2302 getPointerTy(MF.getDataLayout()));
2305 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2306 X86::RAX : X86::EAX;
2307 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2308 Flag = Chain.getValue(1);
2310 // RAX/EAX now acts like a return value.
2312 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2315 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2316 const MCPhysReg *I =
2317 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2320 if (X86::GR64RegClass.contains(*I))
2321 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2323 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2327 RetOps[0] = Chain; // Update chain.
2329 // Add the flag if we have it.
2331 RetOps.push_back(Flag);
2333 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2334 if (CallConv == CallingConv::X86_INTR)
2335 opcode = X86ISD::IRET;
2336 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2339 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2340 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2343 SDValue TCChain = Chain;
2344 SDNode *Copy = *N->use_begin();
2345 if (Copy->getOpcode() == ISD::CopyToReg) {
2346 // If the copy has a glue operand, we conservatively assume it isn't safe to
2347 // perform a tail call.
2348 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2350 TCChain = Copy->getOperand(0);
2351 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2354 bool HasRet = false;
2355 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2357 if (UI->getOpcode() != X86ISD::RET_FLAG)
2359 // If we are returning more than one value, we can definitely
2360 // not make a tail call; see PR19530.
2361 if (UI->getNumOperands() > 4)
2363 if (UI->getNumOperands() == 4 &&
2364 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2376 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2377 ISD::NodeType ExtendKind) const {
2378 MVT ReturnMVT = MVT::i32;
2380 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2381 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2382 // The ABI does not require i1, i8 or i16 to be extended.
2384 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2385 // always extending i8/i16 return values, so keep doing that for now.
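// Net effect: i1 is widened to at least i8 everywhere; i8/i16 keep their
// natural width on non-Darwin targets but are still extended to i32 on Darwin.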
2387 ReturnMVT = MVT::i8;
2390 EVT MinVT = getRegisterType(Context, ReturnMVT);
2391 return VT.bitsLT(MinVT) ? MinVT : VT;
2394 /// Reads two 32-bit registers and creates a 64-bit mask value.
2395 /// \param VA The current 32-bit value that needs to be assigned.
2396 /// \param NextVA The next 32-bit value that needs to be assigned.
2397 /// \param Root The parent DAG node.
2398 /// \param [in,out] InFlag Represents the glue SDValue of the parent DAG node.
2399 /// If the DAG already uses a physical register rather
2400 /// than a virtual one, the new reads must be glued to
2401 /// InFlag.
2402 /// \return a new 64-bit SDValue.
2403 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2404 SDValue &Root, SelectionDAG &DAG,
2405 const SDLoc &Dl, const X86Subtarget &Subtarget,
2406 SDValue *InFlag = nullptr) {
2407 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2408 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2409 assert(VA.getValVT() == MVT::v64i1 &&
2410 "Expecting first location of 64 bit width type");
2411 assert(NextVA.getValVT() == VA.getValVT() &&
2412 "The locations should have the same type");
2413 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2414 "The values should reside in two registers");
2418 SDValue ArgValueLo, ArgValueHi;
2420 MachineFunction &MF = DAG.getMachineFunction();
2421 const TargetRegisterClass *RC = &X86::GR32RegClass;
2423 // Read a 32 bit value from the registers
2424 if (nullptr == InFlag) {
2425 // When no physical register is present,
2426 // create an intermediate virtual register
2427 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2428 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2429 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2430 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2432 // When a physical register is available read the value from it and glue
2433 // the reads together.
2435 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2436 *InFlag = ArgValueLo.getValue(2);
2438 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2439 *InFlag = ArgValueHi.getValue(2);
2442 // Convert the low i32 value to a v32i1 mask.
2443 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2445 // Convert the high i32 value to a v32i1 mask.
2446 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2448 // Concatenate the two halves into a v64i1 value.
2449 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2452 /// Lowers a register of size 8/16/32/64 bits to a mask value of the
2453 /// expected size (v8i1/v16i1/v32i1/v64i1).
2454 /// \returns a DAG node containing the operand after lowering to the mask type.
2455 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2456 const EVT &ValLoc, const SDLoc &Dl,
2457 SelectionDAG &DAG) {
2458 SDValue ValReturned = ValArg;
2460 if (ValVT == MVT::v64i1) {
2461 // On 32-bit targets this case is handled by getv64i1Argument.
2462 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2463 // On 64-bit targets there is no need to truncate the value; just bitcast it.
2466 switch (ValVT.getSimpleVT().SimpleTy) {
2477 llvm_unreachable("Expecting a vector of i1 types");
2480 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2483 return DAG.getBitcast(ValVT, ValReturned);
2486 /// Lower the result values of a call into the
2487 /// appropriate copies out of the corresponding physical registers.
2489 SDValue X86TargetLowering::LowerCallResult(
2490 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2491 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2492 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2494 // Assign locations to each value returned by this call.
2495 SmallVector<CCValAssign, 16> RVLocs;
2496 bool Is64Bit = Subtarget.is64Bit();
2497 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2499 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2501 // Copy all of the result registers out of their specified physreg.
2502 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2504 CCValAssign &VA = RVLocs[I];
2505 EVT CopyVT = VA.getLocVT();
2507 // If this is x86-64, and we disabled SSE, we can't return FP values
2508 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2509 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2510 report_fatal_error("SSE register return with SSE disabled");
2513 // If we prefer to use the value in xmm registers, copy it out as f80 and
2514 // use a truncate to move it from fp stack reg to xmm reg.
2515 bool RoundAfterCopy = false;
2516 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2517 isScalarFPTypeInSSEReg(VA.getValVT())) {
2518 if (!Subtarget.hasX87())
2519 report_fatal_error("X87 register return with X87 disabled");
2521 RoundAfterCopy = (CopyVT != VA.getLocVT());
2525 if (VA.needsCustom()) {
2526 assert(VA.getValVT() == MVT::v64i1 &&
2527 "Currently the only custom case is when we split v64i1 to 2 regs");
2529 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2531 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2533 Val = Chain.getValue(0);
2534 InFlag = Chain.getValue(2);
2538 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2539 // This truncation won't change the value.
2540 DAG.getIntPtrConstant(1, dl));
2542 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2543 if (VA.getValVT().isVector() &&
2544 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2545 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2546 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2547 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2549 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2552 InVals.push_back(Val);
2558 //===----------------------------------------------------------------------===//
2559 // C & StdCall & Fast Calling Convention implementation
2560 //===----------------------------------------------------------------------===//
2561 // The StdCall calling convention is the standard for many Windows API
2562 // routines. It differs from the C calling convention only slightly: the
2563 // callee cleans up the stack rather than the caller, and symbols are
2564 // decorated with an argument-size suffix. It does not support vector
2565 // arguments. For the fast calling convention (tail call), see the
2566 // implementation in LowerX86_32FastCCCallTo.
2568 /// CallIsStructReturn - Determines whether a call uses struct return
2569 /// semantics.
2570 enum StructReturnType {
2571 NotStructReturn,
2572 RegStructReturn,
2573 StackStructReturn
2574 };
2575 static StructReturnType
2576 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2578 return NotStructReturn;
2580 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2581 if (!Flags.isSRet())
2582 return NotStructReturn;
2583 if (Flags.isInReg() || IsMCU)
2584 return RegStructReturn;
2585 return StackStructReturn;
2588 /// Determines whether a function uses struct return semantics.
2589 static StructReturnType
2590 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2592 return NotStructReturn;
2594 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2595 if (!Flags.isSRet())
2596 return NotStructReturn;
2597 if (Flags.isInReg() || IsMCU)
2598 return RegStructReturn;
2599 return StackStructReturn;
2602 /// Make a copy of an aggregate at the address specified by "Src" to the
2603 /// address "Dst", with size and alignment information specified by the
2604 /// argument's parameter attributes. The copy will be passed as a byval function parameter.
2605 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2606 SDValue Chain, ISD::ArgFlagsTy Flags,
2607 SelectionDAG &DAG, const SDLoc &dl) {
2608 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2610 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2611 /*isVolatile*/false, /*AlwaysInline=*/true,
2612 /*isTailCall*/false,
2613 MachinePointerInfo(), MachinePointerInfo());
2616 /// Return true if the calling convention is one that we can guarantee TCO for.
2617 static bool canGuaranteeTCO(CallingConv::ID CC) {
2618 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2619 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2620 CC == CallingConv::HHVM);
2623 /// Return true if we might ever do TCO for calls with this calling convention.
2624 static bool mayTailCallThisCC(CallingConv::ID CC) {
2626 // C calling conventions:
2627 case CallingConv::C:
2628 case CallingConv::X86_64_Win64:
2629 case CallingConv::X86_64_SysV:
2630 // Callee pop conventions:
2631 case CallingConv::X86_ThisCall:
2632 case CallingConv::X86_StdCall:
2633 case CallingConv::X86_VectorCall:
2634 case CallingConv::X86_FastCall:
2637 return canGuaranteeTCO(CC);
2641 /// Return true if the function is being made into a tailcall target by
2642 /// changing its ABI.
2643 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2644 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2647 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2649 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2650 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2654 CallingConv::ID CalleeCC = CS.getCallingConv();
2655 if (!mayTailCallThisCC(CalleeCC))
2662 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2663 const SmallVectorImpl<ISD::InputArg> &Ins,
2664 const SDLoc &dl, SelectionDAG &DAG,
2665 const CCValAssign &VA,
2666 MachineFrameInfo &MFI, unsigned i) const {
2667 // Create the nodes corresponding to a load from this parameter slot.
2668 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2669 bool AlwaysUseMutable = shouldGuaranteeTCO(
2670 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2671 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2674 // If the value is passed by pointer, we have the address passed instead of
2675 // the value itself. No need to extend if the mask value and location share
2676 // the same bit width.
2677 bool ExtendedInMem =
2678 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2679 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2681 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2682 ValVT = VA.getLocVT();
2684 ValVT = VA.getValVT();
2686 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2687 // taken by a return address.
2689 if (CallConv == CallingConv::X86_INTR) {
2690 const X86Subtarget& Subtarget =
2691 static_cast<const X86Subtarget&>(DAG.getSubtarget());
2692 // X86 interrupts may take one or two arguments.
2693 // On the stack there will be no return address as there is in a regular call.
2694 // The offset of the last argument needs to be set to -4/-8 bytes, while the
2695 // offset of the first argument (out of two) should be set to 0 bytes.
2696 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
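// For example, with two incoming values (the interrupt frame pointer plus an
// error code) this yields Offset 0 for i == 0 and -4/-8 for i == 1; with a
// single value it yields -4/-8.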
2699 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2700 // changed with more analysis.
2701 // In case of tail call optimization, mark all arguments mutable, since they
2702 // could be overwritten by the lowering of arguments in a tail call.
2703 if (Flags.isByVal()) {
2704 unsigned Bytes = Flags.getByValSize();
2705 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2706 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2707 // Adjust SP offset of interrupt parameter.
2708 if (CallConv == CallingConv::X86_INTR) {
2709 MFI.setObjectOffset(FI, Offset);
2711 return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2713 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
2714 VA.getLocMemOffset(), isImmutable);
2716 // Set SExt or ZExt flag.
2717 if (VA.getLocInfo() == CCValAssign::ZExt) {
2718 MFI.setObjectZExt(FI, true);
2719 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2720 MFI.setObjectSExt(FI, true);
2723 // Adjust SP offset of interrupt parameter.
2724 if (CallConv == CallingConv::X86_INTR) {
2725 MFI.setObjectOffset(FI, Offset);
2728 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2729 SDValue Val = DAG.getLoad(
2730 ValVT, dl, Chain, FIN,
2731 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2732 return ExtendedInMem ?
2733 DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2737 // FIXME: Get this from tablegen.
2738 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2739 const X86Subtarget &Subtarget) {
2740 assert(Subtarget.is64Bit());
2742 if (Subtarget.isCallingConvWin64(CallConv)) {
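// Win64 passes the first four integer arguments in RCX, RDX, R8 and R9; the
// SysV AMD64 convention below uses RDI, RSI, RDX, RCX, R8 and R9.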
2743 static const MCPhysReg GPR64ArgRegsWin64[] = {
2744 X86::RCX, X86::RDX, X86::R8, X86::R9
2746 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2749 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2750 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2752 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2755 // FIXME: Get this from tablegen.
2756 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2757 CallingConv::ID CallConv,
2758 const X86Subtarget &Subtarget) {
2759 assert(Subtarget.is64Bit());
2760 if (Subtarget.isCallingConvWin64(CallConv)) {
2761 // The XMM registers which might contain var arg parameters are shadowed
2762 // in their paired GPR, so we only need to save the GPRs to their home
2763 // slots.
2764 // TODO: __vectorcall will change this.
2768 const Function *Fn = MF.getFunction();
2769 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2770 bool isSoftFloat = Subtarget.useSoftFloat();
2771 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2772 "SSE register cannot be used when SSE is disabled!");
2773 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2774 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
2778 static const MCPhysReg XMMArgRegs64Bit[] = {
2779 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2780 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2782 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
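// The lowering loops below walk ArgLocs and Ins/Outs in lockstep, which is only
// valid when the locations are sorted by the original value number; this helper
// checks that.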
2785 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2786 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2787 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2788 return A.getValNo() < B.getValNo();
2792 SDValue X86TargetLowering::LowerFormalArguments(
2793 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2794 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2795 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2796 MachineFunction &MF = DAG.getMachineFunction();
2797 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2798 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2800 const Function *Fn = MF.getFunction();
2801 if (Fn->hasExternalLinkage() &&
2802 Subtarget.isTargetCygMing() &&
2803 Fn->getName() == "main")
2804 FuncInfo->setForceFramePointer(true);
2806 MachineFrameInfo &MFI = MF.getFrameInfo();
2807 bool Is64Bit = Subtarget.is64Bit();
2808 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2811 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2812 "Var args not supported with calling conventions regcall, fastcc, ghc or hipe");
2814 if (CallConv == CallingConv::X86_INTR) {
2815 bool isLegal = Ins.size() == 1 ||
2816 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2817 (!Is64Bit && Ins[1].VT == MVT::i32)));
2818 if (!isLegal)
2819 report_fatal_error("X86 interrupts may take one or two arguments");
2822 // Assign locations to all of the incoming arguments.
2823 SmallVector<CCValAssign, 16> ArgLocs;
2824 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2826 // Allocate shadow area for Win64.
2828 CCInfo.AllocateStack(32, 8);
2830 CCInfo.AnalyzeArguments(Ins, CC_X86);
2832 // In the vectorcall calling convention a second pass is required for the HVA
2833 // (homogeneous vector aggregate) arguments.
2834 if (CallingConv::X86_VectorCall == CallConv) {
2835 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2838 // The next loop assumes that the locations are in the same order as the
2839 // incoming Ins arguments.
2840 if (!isSortedByValueNo(ArgLocs))
2841 llvm_unreachable("Argument Location list must be sorted before lowering");
2844 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2846 assert(InsIndex < Ins.size() && "Invalid Ins index");
2847 CCValAssign &VA = ArgLocs[I];
2849 if (VA.isRegLoc()) {
2850 EVT RegVT = VA.getLocVT();
2851 if (VA.needsCustom()) {
2853 VA.getValVT() == MVT::v64i1 &&
2854 "Currently the only custom case is when we split v64i1 to 2 regs");
2856 // v64i1 values, in the regcall calling convention, that are
2857 // compiled for a 32-bit arch, are split into two registers.
2859 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2861 const TargetRegisterClass *RC;
2862 if (RegVT == MVT::i32)
2863 RC = &X86::GR32RegClass;
2864 else if (Is64Bit && RegVT == MVT::i64)
2865 RC = &X86::GR64RegClass;
2866 else if (RegVT == MVT::f32)
2867 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2868 else if (RegVT == MVT::f64)
2869 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2870 else if (RegVT == MVT::f80)
2871 RC = &X86::RFP80RegClass;
2872 else if (RegVT == MVT::f128)
2873 RC = &X86::FR128RegClass;
2874 else if (RegVT.is512BitVector())
2875 RC = &X86::VR512RegClass;
2876 else if (RegVT.is256BitVector())
2877 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2878 else if (RegVT.is128BitVector())
2879 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2880 else if (RegVT == MVT::x86mmx)
2881 RC = &X86::VR64RegClass;
2882 else if (RegVT == MVT::i1)
2883 RC = &X86::VK1RegClass;
2884 else if (RegVT == MVT::v8i1)
2885 RC = &X86::VK8RegClass;
2886 else if (RegVT == MVT::v16i1)
2887 RC = &X86::VK16RegClass;
2888 else if (RegVT == MVT::v32i1)
2889 RC = &X86::VK32RegClass;
2890 else if (RegVT == MVT::v64i1)
2891 RC = &X86::VK64RegClass;
2893 llvm_unreachable("Unknown argument type!");
2895 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2896 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2899 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2900 // bits. Insert an assert[sz]ext to capture this, then truncate to the
2901 // right size.
2902 if (VA.getLocInfo() == CCValAssign::SExt)
2903 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2904 DAG.getValueType(VA.getValVT()));
2905 else if (VA.getLocInfo() == CCValAssign::ZExt)
2906 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2907 DAG.getValueType(VA.getValVT()));
2908 else if (VA.getLocInfo() == CCValAssign::BCvt)
2909 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2911 if (VA.isExtInLoc()) {
2912 // Handle MMX values passed in XMM regs.
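// (MOVDQ2Q copies the low 64 bits of the XMM register into an MMX register.)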
2913 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2914 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2915 else if (VA.getValVT().isVector() &&
2916 VA.getValVT().getScalarType() == MVT::i1 &&
2917 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2918 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2919 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2920 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
2922 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2925 assert(VA.isMemLoc());
2927 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
2930 // If value is passed via pointer - do a load.
2931 if (VA.getLocInfo() == CCValAssign::Indirect)
2933 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
2935 InVals.push_back(ArgValue);
2938 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
2939 // The Swift calling convention does not require us to copy the sret argument
2940 // into %rax/%eax for the return; we don't set SRetReturnReg for Swift.
2941 if (CallConv == CallingConv::Swift)
2944 // All x86 ABIs require that for returning structs by value we copy the
2945 // sret argument into %rax/%eax (depending on ABI) for the return. Save
2946 // the argument into a virtual register so that we can access it from the
2947 // return points.
2948 if (Ins[I].Flags.isSRet()) {
2949 unsigned Reg = FuncInfo->getSRetReturnReg();
2950 if (Reg == 0) {
2951 MVT PtrTy = getPointerTy(DAG.getDataLayout());
2952 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2953 FuncInfo->setSRetReturnReg(Reg);
2954 }
2955 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
2956 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2961 unsigned StackSize = CCInfo.getNextStackOffset();
2962 // Align stack specially for tail calls.
2963 if (shouldGuaranteeTCO(CallConv,
2964 MF.getTarget().Options.GuaranteedTailCallOpt))
2965 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2967 // If the function takes variable number of arguments, make a frame index for
2968 // the start of the first vararg value... for expansion of llvm.va_start. We
2969 // can skip this if there are no va_start calls.
2970 if (MFI.hasVAStart() &&
2971 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2972 CallConv != CallingConv::X86_ThisCall))) {
2973 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
2976 // Figure out if XMM registers are in use.
2977 assert(!(Subtarget.useSoftFloat() &&
2978 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2979 "SSE register cannot be used when SSE is disabled!");
2981 // 64-bit calling conventions support varargs and register parameters, so we
2982 // have to do extra work to spill them in the prologue.
2983 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
2984 // Find the first unallocated argument registers.
2985 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2986 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2987 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
2988 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
2989 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
2990 "SSE register cannot be used when SSE is disabled!");
2992 // Gather all the live in physical registers.
2993 SmallVector<SDValue, 6> LiveGPRs;
2994 SmallVector<SDValue, 8> LiveXMMRegs;
2996 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2997 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2999 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3001 if (!ArgXMMs.empty()) {
3002 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3003 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3004 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3005 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3006 LiveXMMRegs.push_back(
3007 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3012 // Get to the caller-allocated home save location. Add 8 to account
3013 // for the return address.
3014 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3015 FuncInfo->setRegSaveFrameIndex(
3016 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3017 // Fixup to set vararg frame on shadow area (4 x i64).
3019 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3021 // For X86-64, if there are vararg parameters that are passed via
3022 // registers, then we must store them to their spots on the stack so
3023 // they may be loaded by dereferencing the result of va_next.
3024 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3025 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3026 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3027 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
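// For the SysV AMD64 convention this reserves 6*8 = 48 bytes for the GPRs
// followed by 8*16 = 128 bytes for the XMM registers; the gp_offset/fp_offset
// values recorded above index into this area the same way va_arg does.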
3030 // Store the integer parameter registers.
3031 SmallVector<SDValue, 8> MemOps;
3032 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3033 getPointerTy(DAG.getDataLayout()));
3034 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3035 for (SDValue Val : LiveGPRs) {
3036 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3037 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3039 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3040 MachinePointerInfo::getFixedStack(
3041 DAG.getMachineFunction(),
3042 FuncInfo->getRegSaveFrameIndex(), Offset));
3043 MemOps.push_back(Store);
3047 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3048 // Now store the XMM (fp + vector) parameter registers.
3049 SmallVector<SDValue, 12> SaveXMMOps;
3050 SaveXMMOps.push_back(Chain);
3051 SaveXMMOps.push_back(ALVal);
3052 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3053 FuncInfo->getRegSaveFrameIndex(), dl));
3054 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3055 FuncInfo->getVarArgsFPOffset(), dl));
3056 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3058 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3059 MVT::Other, SaveXMMOps));
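// VASTART_SAVE_XMM_REGS is a pseudo that is expanded into a test of AL followed
// by the XMM stores, so the spills are skipped when the caller passed no vector
// arguments.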
3062 if (!MemOps.empty())
3063 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3066 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3067 // Find the largest legal vector type.
3068 MVT VecVT = MVT::Other;
3069 // FIXME: Only some x86_32 calling conventions support AVX512.
3070 if (Subtarget.hasAVX512() &&
3071 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3072 CallConv == CallingConv::Intel_OCL_BI)))
3073 VecVT = MVT::v16f32;
3074 else if (Subtarget.hasAVX())
3075 VecVT = MVT::v8f32;
3076 else if (Subtarget.hasSSE2())
3077 VecVT = MVT::v4f32;
3079 // We forward some GPRs and some vector types.
3080 SmallVector<MVT, 2> RegParmTypes;
3081 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3082 RegParmTypes.push_back(IntVT);
3083 if (VecVT != MVT::Other)
3084 RegParmTypes.push_back(VecVT);
3086 // Compute the set of forwarded registers. The rest are scratch.
3087 SmallVectorImpl<ForwardedRegister> &Forwards =
3088 FuncInfo->getForwardedMustTailRegParms();
3089 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3091 // Conservatively forward AL on x86_64, since it might be used for varargs.
3092 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3093 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3094 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3097 // Copy all forwards from physical to virtual registers.
3098 for (ForwardedRegister &F : Forwards) {
3099 // FIXME: Can we use a less constrained schedule?
3100 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3101 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3102 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3106 // Some CCs need callee pop.
3107 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3108 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3109 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3110 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3111 // X86 interrupts must pop the error code if present
3112 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
3114 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3115 // If this is an sret function, the return should pop the hidden pointer.
3116 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3117 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3118 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3119 FuncInfo->setBytesToPopOnReturn(4);
3123 // RegSaveFrameIndex is X86-64 only.
3124 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3125 if (CallConv == CallingConv::X86_FastCall ||
3126 CallConv == CallingConv::X86_ThisCall)
3127 // fastcc functions can't have varargs.
3128 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3131 FuncInfo->setArgumentStackSize(StackSize);
3133 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3134 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3135 if (Personality == EHPersonality::CoreCLR) {
3137 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3138 // that we'd prefer this slot be allocated towards the bottom of the frame
3139 // (i.e. near the stack pointer after allocating the frame). Every
3140 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3141 // offset from the bottom of this and each funclet's frame must be the
3142 // same, so the size of funclets' (mostly empty) frames is dictated by
3143 // how far this slot is from the bottom (since they allocate just enough
3144 // space to accommodate holding this slot at the correct offset).
3145 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3146 EHInfo->PSPSymFrameIdx = PSPSymFI;
3153 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3154 SDValue Arg, const SDLoc &dl,
3156 const CCValAssign &VA,
3157 ISD::ArgFlagsTy Flags) const {
3158 unsigned LocMemOffset = VA.getLocMemOffset();
3159 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3160 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3162 if (Flags.isByVal())
3163 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3165 return DAG.getStore(
3166 Chain, dl, Arg, PtrOff,
3167 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3170 /// Emit a load of the return address if tail call
3171 /// optimization is performed and it is required.
3172 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3173 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3174 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3175 // Adjust the Return address stack slot.
3176 EVT VT = getPointerTy(DAG.getDataLayout());
3177 OutRetAddr = getReturnAddressFrameIndex(DAG);
3179 // Load the "old" Return address.
3180 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3181 return SDValue(OutRetAddr.getNode(), 1);
3184 /// Emit a store of the return address if tail call
3185 /// optimization is performed and it is required (FPDiff!=0).
3186 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3187 SDValue Chain, SDValue RetAddrFrIdx,
3188 EVT PtrVT, unsigned SlotSize,
3189 int FPDiff, const SDLoc &dl) {
3190 // Store the return address to the appropriate stack slot.
3191 if (!FPDiff) return Chain;
3192 // Calculate the new stack slot for the return address.
3193 int NewReturnAddrFI =
3194 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3195 false);
3196 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3197 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3198 MachinePointerInfo::getFixedStack(
3199 DAG.getMachineFunction(), NewReturnAddrFI));
3203 /// Returns a vector_shuffle mask for a movs{s|d}, movd
3204 /// operation of the specified width.
3205 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3207 unsigned NumElems = VT.getVectorNumElements();
3208 SmallVector<int, 8> Mask;
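// The mask takes element 0 from V2 and the remaining elements from V1, e.g.
// <4, 1, 2, 3> for a 4-element type, matching movss/movsd semantics.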
3209 Mask.push_back(NumElems);
3210 for (unsigned i = 1; i != NumElems; ++i)
3211 Mask.push_back(i);
3212 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3216 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3217 SmallVectorImpl<SDValue> &InVals) const {
3218 SelectionDAG &DAG = CLI.DAG;
3220 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3221 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3222 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3223 SDValue Chain = CLI.Chain;
3224 SDValue Callee = CLI.Callee;
3225 CallingConv::ID CallConv = CLI.CallConv;
3226 bool &isTailCall = CLI.IsTailCall;
3227 bool isVarArg = CLI.IsVarArg;
3229 MachineFunction &MF = DAG.getMachineFunction();
3230 bool Is64Bit = Subtarget.is64Bit();
3231 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3232 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3233 bool IsSibcall = false;
3234 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3235 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3237 if (CallConv == CallingConv::X86_INTR)
3238 report_fatal_error("X86 interrupts may not be called directly");
3240 if (Attr.getValueAsString() == "true")
3241 isTailCall = false;
3243 if (Subtarget.isPICStyleGOT() &&
3244 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3245 // If we are using a GOT, disable tail calls to external symbols with
3246 // default visibility. Tail calling such a symbol requires using a GOT
3247 // relocation, which forces early binding of the symbol. This breaks code
3248 // that requires lazy function symbol resolution. Using musttail or
3249 // GuaranteedTailCallOpt will override this.
3250 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3251 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3252 G->getGlobal()->hasDefaultVisibility()))
3256 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3258 // Force this to be a tail call. The verifier rules are enough to ensure
3259 // that we can lower this successfully without moving the return address
3262 } else if (isTailCall) {
3263 // Check if it's really possible to do a tail call.
3264 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3265 isVarArg, SR != NotStructReturn,
3266 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3267 Outs, OutVals, Ins, DAG);
3269 // Sibcalls are automatically detected tailcalls which do not require
3270 // ABI changes.
3271 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3278 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3279 "Var args not supported with calling convention fastcc, ghc or hipe");
3281 // Analyze operands of the call, assigning locations to each operand.
3282 SmallVector<CCValAssign, 16> ArgLocs;
3283 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3285 // Allocate shadow area for Win64.
3287 CCInfo.AllocateStack(32, 8);
3289 CCInfo.AnalyzeArguments(Outs, CC_X86);
3291 // In vectorcall calling convention a second pass is required for the HVA
3293 if (CallingConv::X86_VectorCall == CallConv) {
3294 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3297 // Get a count of how many bytes are to be pushed on the stack.
3298 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3299 if (IsSibcall)
3300 // This is a sibcall. The memory operands are already available in the
3301 // caller's own incoming argument area.
3302 NumBytes = 0;
3303 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3304 canGuaranteeTCO(CallConv))
3305 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3308 if (isTailCall && !IsSibcall && !IsMustTail) {
3309 // Lower arguments at fp - stackoffset + fpdiff.
3310 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3312 FPDiff = NumBytesCallerPushed - NumBytes;
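// A negative FPDiff means the callee needs more argument space than the caller
// provides, so the return address slot has to be moved by that many bytes.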
3314 // Set the delta of movement of the returnaddr stackslot.
3315 // But only set if delta is greater than previous delta.
3316 if (FPDiff < X86Info->getTCReturnAddrDelta())
3317 X86Info->setTCReturnAddrDelta(FPDiff);
3320 unsigned NumBytesToPush = NumBytes;
3321 unsigned NumBytesToPop = NumBytes;
3323 // If we have an inalloca argument, all stack space has already been allocated
3324 // for us and is right at the top of the stack. We don't support multiple
3325 // arguments passed in memory when using inalloca.
3326 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3328 if (!ArgLocs.back().isMemLoc())
3329 report_fatal_error("cannot use inalloca attribute on a register "
3331 if (ArgLocs.back().getLocMemOffset() != 0)
3332 report_fatal_error("any parameter with the inalloca attribute must be "
3333 "the only memory argument");
3337 Chain = DAG.getCALLSEQ_START(
3338 Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3340 SDValue RetAddrFrIdx;
3341 // Load return address for tail calls.
3342 if (isTailCall && FPDiff)
3343 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3344 Is64Bit, FPDiff, dl);
3346 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3347 SmallVector<SDValue, 8> MemOpChains;
3350 // The next loop assumes that the locations are in the same order as the
3351 // outgoing Outs arguments.
3352 if (!isSortedByValueNo(ArgLocs))
3353 llvm_unreachable("Argument Location list must be sorted before lowering");
3355 // Walk the register/memloc assignments, inserting copies/loads. In the case
3356 // of tail call optimization, arguments are handled later.
3357 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3358 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3360 assert(OutIndex < Outs.size() && "Invalid Out index");
3361 // Skip inalloca arguments, they have already been written.
3362 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3363 if (Flags.isInAlloca())
3366 CCValAssign &VA = ArgLocs[I];
3367 EVT RegVT = VA.getLocVT();
3368 SDValue Arg = OutVals[OutIndex];
3369 bool isByVal = Flags.isByVal();
3371 // Promote the value if needed.
3372 switch (VA.getLocInfo()) {
3373 default: llvm_unreachable("Unknown loc info!");
3374 case CCValAssign::Full: break;
3375 case CCValAssign::SExt:
3376 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3378 case CCValAssign::ZExt:
3379 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3381 case CCValAssign::AExt:
3382 if (Arg.getValueType().isVector() &&
3383 Arg.getValueType().getVectorElementType() == MVT::i1)
3384 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3385 else if (RegVT.is128BitVector()) {
3386 // Special case: passing MMX values in XMM registers.
3387 Arg = DAG.getBitcast(MVT::i64, Arg);
3388 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3389 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3391 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3393 case CCValAssign::BCvt:
3394 Arg = DAG.getBitcast(RegVT, Arg);
3396 case CCValAssign::Indirect: {
3397 // Store the argument.
3398 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3399 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3400 Chain = DAG.getStore(
3401 Chain, dl, Arg, SpillSlot,
3402 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
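// From here on the spill slot address, rather than the value itself, is what
// gets passed to the callee.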
3408 if (VA.needsCustom()) {
3409 assert(VA.getValVT() == MVT::v64i1 &&
3410 "Currently the only custom case is when we split v64i1 to 2 regs");
3411 // Split v64i1 value into two registers
3412 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3414 } else if (VA.isRegLoc()) {
3415 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3416 if (isVarArg && IsWin64) {
3417 // The Win64 ABI requires an argument in an XMM register to be copied to its
3418 // corresponding shadow GPR if the callee is a varargs function.
3419 unsigned ShadowReg = 0;
3420 switch (VA.getLocReg()) {
3421 case X86::XMM0: ShadowReg = X86::RCX; break;
3422 case X86::XMM1: ShadowReg = X86::RDX; break;
3423 case X86::XMM2: ShadowReg = X86::R8; break;
3424 case X86::XMM3: ShadowReg = X86::R9; break;
3427 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3429 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3430 assert(VA.isMemLoc());
3431 if (!StackPtr.getNode())
3432 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3433 getPointerTy(DAG.getDataLayout()));
3434 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3435 dl, DAG, VA, Flags));
3439 if (!MemOpChains.empty())
3440 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3442 if (Subtarget.isPICStyleGOT()) {
3443 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3446 RegsToPass.push_back(std::make_pair(
3447 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3448 getPointerTy(DAG.getDataLayout()))));
3450 // If we are tail calling and generating PIC/GOT style code load the
3451 // address of the callee into ECX. The value in ecx is used as target of
3452 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3453 // for tail calls on PIC/GOT architectures. Normally we would just put the
3454 // address of GOT into ebx and then call target@PLT. But for tail calls
3455 // ebx would be restored (since ebx is callee saved) before jumping to the callee.
3458 // Note: The actual moving to ECX is done further down.
3459 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3460 if (G && !G->getGlobal()->hasLocalLinkage() &&
3461 G->getGlobal()->hasDefaultVisibility())
3462 Callee = LowerGlobalAddress(Callee, DAG);
3463 else if (isa<ExternalSymbolSDNode>(Callee))
3464 Callee = LowerExternalSymbol(Callee, DAG);
3468 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3469 // From AMD64 ABI document:
3470 // For calls that may call functions that use varargs or stdargs
3471 // (prototype-less calls or calls to functions containing ellipsis (...) in
3472 // the declaration) %al is used as hidden argument to specify the number
3473 // of SSE registers used. The contents of %al do not need to match exactly
3474 // the number of registers, but must be an upper bound on the number of SSE
3475 // registers used and is in the range 0 - 8 inclusive.
3477 // Count the number of XMM registers allocated.
3478 static const MCPhysReg XMMArgRegs[] = {
3479 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3480 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3482 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3483 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3484 && "SSE registers cannot be used when SSE is disabled");
3486 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3487 DAG.getConstant(NumXMMRegs, dl,
3488 MVT::i8)));
3491 if (isVarArg && IsMustTail) {
3492 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3493 for (const auto &F : Forwards) {
3494 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3495 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3499 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3500 // don't need this because the eligibility check rejects calls that require
3501 // shuffling arguments passed in memory.
3502 if (!IsSibcall && isTailCall) {
3503 // Force all the incoming stack arguments to be loaded from the stack
3504 // before any new outgoing arguments are stored to the stack, because the
3505 // outgoing stack slots may alias the incoming argument stack slots, and
3506 // the alias isn't otherwise explicit. This is slightly more conservative
3507 // than necessary, because it means that each store effectively depends
3508 // on every argument instead of just those arguments it would clobber.
3509 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3511 SmallVector<SDValue, 8> MemOpChains2;
3514 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3516 CCValAssign &VA = ArgLocs[I];
3518 if (VA.isRegLoc()) {
3519 if (VA.needsCustom()) {
3520 assert((CallConv == CallingConv::X86_RegCall) &&
3521 "Expecting custome case only in regcall calling convention");
3522 // This means that we are in special case where one argument was
3523 // passed through two register locations - Skip the next location
3530 assert(VA.isMemLoc());
3531 SDValue Arg = OutVals[OutsIndex];
3532 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3533 // Skip inalloca arguments. They don't require any work.
3534 if (Flags.isInAlloca())
3536 // Create frame index.
3537 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3538 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3539 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3540 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3542 if (Flags.isByVal()) {
3543 // Copy relative to framepointer.
3544 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3545 if (!StackPtr.getNode())
3546 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3547 getPointerTy(DAG.getDataLayout()));
3548 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3551 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3555 // Store relative to framepointer.
3556 MemOpChains2.push_back(DAG.getStore(
3557 ArgChain, dl, Arg, FIN,
3558 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3562 if (!MemOpChains2.empty())
3563 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3565 // Store the return address to the appropriate stack slot.
3566 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3567 getPointerTy(DAG.getDataLayout()),
3568 RegInfo->getSlotSize(), FPDiff, dl);
3571 // Build a sequence of copy-to-reg nodes chained together with token chain
3572 // and flag operands which copy the outgoing args into registers.
3574 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3575 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3576 RegsToPass[i].second, InFlag);
3577 InFlag = Chain.getValue(1);
3580 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3581 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3582 // In the 64-bit large code model, we have to make all calls
3583 // through a register, since the call instruction's 32-bit
3584 // pc-relative offset may not be large enough to hold the whole address.
3586 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3587 // If the callee is a GlobalAddress node (quite common, every direct call
3588 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
3590 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3592 // We should use an extra load for direct calls to dllimported functions in non-JIT mode.
3594 const GlobalValue *GV = G->getGlobal();
3595 if (!GV->hasDLLImportStorageClass()) {
3596 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3598 Callee = DAG.getTargetGlobalAddress(
3599 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3601 if (OpFlags == X86II::MO_GOTPCREL) {
3603 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3604 getPointerTy(DAG.getDataLayout()), Callee);
3605 // Add extra indirection
3606 Callee = DAG.getLoad(
3607 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3608 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3611 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3612 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3613 unsigned char OpFlags =
3614 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3616 Callee = DAG.getTargetExternalSymbol(
3617 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3618 } else if (Subtarget.isTarget64BitILP32() &&
3619 Callee->getValueType(0) == MVT::i32) {
3620 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3621 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
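// On x32 pointers are 32 bits wide, but the call instruction still consumes a
// 64-bit target address.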
3624 // Returns a chain & a flag for retval copy to use.
3625 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3626 SmallVector<SDValue, 8> Ops;
3628 if (!IsSibcall && isTailCall) {
3629 Chain = DAG.getCALLSEQ_END(Chain,
3630 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3631 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3632 InFlag = Chain.getValue(1);
3635 Ops.push_back(Chain);
3636 Ops.push_back(Callee);
3639 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3641 // Add argument registers to the end of the list so that they are known live
3643 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3644 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3645 RegsToPass[i].second.getValueType()));
3647 // Add a register mask operand representing the call-preserved registers.
3648 const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
3649 assert(Mask && "Missing call preserved mask for calling convention");
3651 // If this is an invoke in a 32-bit function using a funclet-based
3652 // personality, assume the function clobbers all registers. If an exception
3653 // is thrown, the runtime will not restore CSRs.
3654 // FIXME: Model this more precisely so that we can register allocate across
3655 // the normal edge and spill and fill across the exceptional edge.
3656 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3657 const Function *CallerFn = MF.getFunction();
3658 EHPersonality Pers =
3659 CallerFn->hasPersonalityFn()
3660 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3661 : EHPersonality::Unknown;
3662 if (isFuncletEHPersonality(Pers))
3663 Mask = RegInfo->getNoPreservedMask();
3666 Ops.push_back(DAG.getRegisterMask(Mask));
3668 if (InFlag.getNode())
3669 Ops.push_back(InFlag);
3673 //// If this is the first return lowered for this function, add the regs
3674 //// to the liveout set for the function.
3675 // This isn't right, although it's probably harmless on x86; liveouts
3676 // should be computed from returns not tail calls. Consider a void
3677 // function making a tail call to a function returning int.
3678 MF.getFrameInfo().setHasTailCall();
3679 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3682 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3683 InFlag = Chain.getValue(1);
3685 // Create the CALLSEQ_END node.
3686 unsigned NumBytesForCalleeToPop;
3687 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3688 DAG.getTarget().Options.GuaranteedTailCallOpt))
3689 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3690 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3691 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3692 SR == StackStructReturn)
3693 // If this is a call to a struct-return function, the callee
3694 // pops the hidden struct pointer, so we have to push it back.
3695 // This is common for Darwin/X86, Linux & Mingw32 targets.
3696 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3697 NumBytesForCalleeToPop = 4;
3699 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3701 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3702 // No need to reset the stack after the call if the call doesn't return. To
3703 // make the MI verifier happy, we'll pretend the callee does it for us.
3704 NumBytesForCalleeToPop = NumBytes;
3707 // Returns a flag for retval copy to use.
3709 Chain = DAG.getCALLSEQ_END(Chain,
3710 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3711 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3714 InFlag = Chain.getValue(1);
3717 // Handle result values, copying them out of physregs into vregs that we
3719 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3720 Ins, dl, DAG, InVals);
3723 //===----------------------------------------------------------------------===//
3724 // Fast Calling Convention (tail call) implementation
3725 //===----------------------------------------------------------------------===//
3727 // Like the stdcall convention, the callee cleans up the arguments, except that
3728 // ECX is reserved for storing the address of the tail-called function. Only 2
3729 // registers are free for argument passing (inreg). Tail call optimization is
3730 // performed provided that:
3731 // * tailcallopt is enabled
3732 // * caller/callee are fastcc
3733 // On X86_64 architecture with GOT-style position independent code only local
3734 // (within module) calls are supported at the moment.
3735 // To keep the stack aligned according to the platform ABI, the function
3736 // GetAlignedArgumentStackSize ensures that the argument delta is always a multiple
3737 // of the stack alignment. (Dynamic linkers need this - darwin's dyld, for example.)
3738 // If a tail called function callee has more arguments than the caller the
3739 // caller needs to make sure that there is room to move the RETADDR to. This is
3740 // achieved by reserving an area the size of the argument delta right after the
3741 // original RETADDR, but before the saved framepointer or the spilled registers
3742 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3754 /// Round the stack size up so that, together with the return-address slot, the
3755 /// stack stays aligned (e.g. 16n + 12 for a 16-byte alignment with a 4-byte slot).
3757 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3758 SelectionDAG& DAG) const {
3759 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3760 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3761 unsigned StackAlignment = TFI.getStackAlignment();
3762 uint64_t AlignMask = StackAlignment - 1;
3763 int64_t Offset = StackSize;
3764 unsigned SlotSize = RegInfo->getSlotSize();
3765 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3766 // The misalignment is no larger than (alignment - slot size), so just pad up to that boundary.
3767 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3769 // Otherwise mask out the lower bits, then add the stack alignment plus the (alignment - slot size) padding.
3770 Offset = ((~AlignMask) & Offset) + StackAlignment +
3771 (StackAlignment-SlotSize);
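// For example, with a 16-byte stack alignment and an 8-byte slot, an argument
// size of 20 becomes 24 and 28 becomes 40, so the argument area plus the
// return-address slot stays 16-byte aligned.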
3776 /// Return true if the given stack call argument is already available in the
3777 /// same position (relatively) of the caller's incoming argument stack.
3779 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3780 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3781 const X86InstrInfo *TII, const CCValAssign &VA) {
3782 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3785 // Look through nodes that don't alter the bits of the incoming value.
3786 unsigned Op = Arg.getOpcode();
3787 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3788 Arg = Arg.getOperand(0);
3791 if (Op == ISD::TRUNCATE) {
3792 const SDValue &TruncInput = Arg.getOperand(0);
3793 if (TruncInput.getOpcode() == ISD::AssertZext &&
3794 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3795 Arg.getValueType()) {
3796 Arg = TruncInput.getOperand(0);
3804 if (Arg.getOpcode() == ISD::CopyFromReg) {
3805 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3806 if (!TargetRegisterInfo::isVirtualRegister(VR))
3808 MachineInstr *Def = MRI->getVRegDef(VR);
3811 if (!Flags.isByVal()) {
3812 if (!TII->isLoadFromStackSlot(*Def, FI))
3815 unsigned Opcode = Def->getOpcode();
3816 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3817 Opcode == X86::LEA64_32r) &&
3818 Def->getOperand(1).isFI()) {
3819 FI = Def->getOperand(1).getIndex();
3820 Bytes = Flags.getByValSize();
3824 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3825 if (Flags.isByVal())
3826 // ByVal argument is passed in as a pointer but it's now being
3827 // dereferenced. e.g.
3828 // define @foo(%struct.X* %A) {
3829 // tail call @bar(%struct.X* byval %A)
3832 SDValue Ptr = Ld->getBasePtr();
3833 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3836 FI = FINode->getIndex();
3837 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3838 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3839 FI = FINode->getIndex();
3840 Bytes = Flags.getByValSize();
3844 assert(FI != INT_MAX);
3845 if (!MFI.isFixedObjectIndex(FI))
3848 if (Offset != MFI.getObjectOffset(FI))
3851 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3852 // If the argument location is wider than the argument type, check that any
3853 // extension flags match.
3854 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
3855 Flags.isSExt() != MFI.isObjectSExt(FI)) {
3860 return Bytes == MFI.getObjectSize(FI);
3863 /// Check whether the call is eligible for tail call optimization. Targets
3864 /// that want to do tail call optimization should implement this function.
3865 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3866 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3867 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3868 const SmallVectorImpl<ISD::OutputArg> &Outs,
3869 const SmallVectorImpl<SDValue> &OutVals,
3870 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3871 if (!mayTailCallThisCC(CalleeCC))
3874 // If -tailcallopt is specified, make fastcc functions tail-callable.
3875 MachineFunction &MF = DAG.getMachineFunction();
3876 const Function *CallerF = MF.getFunction();
3878 // If the function return type is x86_fp80 and the callee return type is not,
3879 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3880 // perform a tailcall optimization here.
3881 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3884 CallingConv::ID CallerCC = CallerF->getCallingConv();
3885 bool CCMatch = CallerCC == CalleeCC;
3886 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
3887 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
3889 // Win64 functions have extra shadow space for argument homing. Don't do the
3890 // sibcall if the caller and callee have mismatched expectations for this space.
3892 if (IsCalleeWin64 != IsCallerWin64)
3895 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3896 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3901 // Look for obvious safe cases to perform tail call optimization that do not
3902 // require ABI changes. This is what gcc calls sibcall.
3904 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3905 // emit a special epilogue.
3906 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3907 if (RegInfo->needsStackRealignment(MF))
3910 // Also avoid sibcall optimization if either caller or callee uses struct
3911 // return semantics.
3912 if (isCalleeStructRet || isCallerStructRet)
3915 // Do not sibcall optimize vararg calls unless all arguments are passed via
3916 // registers.
3917 LLVMContext &C = *DAG.getContext();
3918 if (isVarArg && !Outs.empty()) {
3919 // Optimizing for varargs on Win64 is unlikely to be safe without
3920 // additional testing.
3921 if (IsCalleeWin64 || IsCallerWin64)
3924 SmallVector<CCValAssign, 16> ArgLocs;
3925 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3927 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3928 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3929 if (!ArgLocs[i].isRegLoc())
3933 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3934 // stack. Therefore, if it's not used by the call it is not safe to optimize
3935 // this into a sibcall.
3936 bool Unused = false;
3937 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3944 SmallVector<CCValAssign, 16> RVLocs;
3945 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
3946 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3947 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3948 CCValAssign &VA = RVLocs[i];
3949 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3954 // Check that the call results are passed in the same way.
3955 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3956 RetCC_X86, RetCC_X86))
3958 // The callee has to preserve all registers the caller needs to preserve.
3959 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3960 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3962 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3963 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3967 unsigned StackArgsSize = 0;
3969 // If the callee takes no arguments then go on to check the results of the
3970 // call.
3971 if (!Outs.empty()) {
3972 // Check if stack adjustment is needed. For now, do not do this if any
3973 // argument is passed on the stack.
3974 SmallVector<CCValAssign, 16> ArgLocs;
3975 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3977 // Allocate shadow area for Win64
3979 CCInfo.AllocateStack(32, 8);
3981 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3982 StackArgsSize = CCInfo.getNextStackOffset();
3984 if (CCInfo.getNextStackOffset()) {
3985 // Check if the arguments are already laid out in the same way as
3986 // the caller's fixed stack objects.
3987 MachineFrameInfo &MFI = MF.getFrameInfo();
3988 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3989 const X86InstrInfo *TII = Subtarget.getInstrInfo();
3990 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3991 CCValAssign &VA = ArgLocs[i];
3992 SDValue Arg = OutVals[i];
3993 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3994 if (VA.getLocInfo() == CCValAssign::Indirect)
3996 if (!VA.isRegLoc()) {
3997 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4004 bool PositionIndependent = isPositionIndependent();
4005 // If the tailcall address may be in a register, then make sure it's
4006 // possible to register allocate for it. In 32-bit, the call address can
4007 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4008 // callee-saved registers are restored. These happen to be the same
4009 // registers used to pass 'inreg' arguments so watch out for those.
4010 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4011 !isa<ExternalSymbolSDNode>(Callee)) ||
4012 PositionIndependent)) {
4013 unsigned NumInRegs = 0;
4014 // In PIC we need an extra register to formulate the address computation for the callee.
4016 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4018 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4019 CCValAssign &VA = ArgLocs[i];
4022 unsigned Reg = VA.getLocReg();
4025 case X86::EAX: case X86::EDX: case X86::ECX:
4026 if (++NumInRegs == MaxInRegs)
4033 const MachineRegisterInfo &MRI = MF.getRegInfo();
4034 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4038 bool CalleeWillPop =
4039 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4040 MF.getTarget().Options.GuaranteedTailCallOpt);
4042 if (unsigned BytesToPop =
4043 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4044 // If we have bytes to pop, the callee must pop them.
4045 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4046 if (!CalleePopMatches)
4048 } else if (CalleeWillPop && StackArgsSize > 0) {
4049 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4057 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4058 const TargetLibraryInfo *libInfo) const {
4059 return X86::createFastISel(funcInfo, libInfo);
4062 //===----------------------------------------------------------------------===//
4063 // Other Lowering Hooks
4064 //===----------------------------------------------------------------------===//
4066 static bool MayFoldLoad(SDValue Op) {
4067 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4070 static bool MayFoldIntoStore(SDValue Op) {
4071 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4074 static bool MayFoldIntoZeroExtend(SDValue Op) {
4075 if (Op.hasOneUse()) {
4076 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4077 return (ISD::ZERO_EXTEND == Opcode);
4082 static bool isTargetShuffle(unsigned Opcode) {
4084 default: return false;
4085 case X86ISD::BLENDI:
4086 case X86ISD::PSHUFB:
4087 case X86ISD::PSHUFD:
4088 case X86ISD::PSHUFHW:
4089 case X86ISD::PSHUFLW:
4091 case X86ISD::INSERTPS:
4092 case X86ISD::PALIGNR:
4093 case X86ISD::VSHLDQ:
4094 case X86ISD::VSRLDQ:
4095 case X86ISD::MOVLHPS:
4096 case X86ISD::MOVLHPD:
4097 case X86ISD::MOVHLPS:
4098 case X86ISD::MOVLPS:
4099 case X86ISD::MOVLPD:
4100 case X86ISD::MOVSHDUP:
4101 case X86ISD::MOVSLDUP:
4102 case X86ISD::MOVDDUP:
4105 case X86ISD::UNPCKL:
4106 case X86ISD::UNPCKH:
4107 case X86ISD::VBROADCAST:
4108 case X86ISD::VPERMILPI:
4109 case X86ISD::VPERMILPV:
4110 case X86ISD::VPERM2X128:
4111 case X86ISD::VPERMIL2:
4112 case X86ISD::VPERMI:
4113 case X86ISD::VPPERM:
4114 case X86ISD::VPERMV:
4115 case X86ISD::VPERMV3:
4116 case X86ISD::VPERMIV3:
4117 case X86ISD::VZEXT_MOVL:
4122 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4124 default: return false;
4126 case X86ISD::PSHUFB:
4127 case X86ISD::VPERMILPV:
4128 case X86ISD::VPERMIL2:
4129 case X86ISD::VPPERM:
4130 case X86ISD::VPERMV:
4131 case X86ISD::VPERMV3:
4132 case X86ISD::VPERMIV3:
4134 // 'Faux' Target Shuffles.
4140 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4141 MachineFunction &MF = DAG.getMachineFunction();
4142 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4143 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4144 int ReturnAddrIndex = FuncInfo->getRAIndex();
4146 if (ReturnAddrIndex == 0) {
4147 // Set up a frame object for the return address.
4148 unsigned SlotSize = RegInfo->getSlotSize();
4149 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4152 FuncInfo->setRAIndex(ReturnAddrIndex);
4155 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4158 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4159 bool hasSymbolicDisplacement) {
4160 // Offset should fit into 32 bit immediate field.
4161 if (!isInt<32>(Offset))
4164 // If we don't have a symbolic displacement, we don't have any extra
4165 // restrictions.
4166 if (!hasSymbolicDisplacement)
4169 // FIXME: Some tweaks might be needed for medium code model.
4170 if (M != CodeModel::Small && M != CodeModel::Kernel)
4173 // For the small code model we assume that the latest object is 16MB below the
4174 // end of the 31-bit address boundary. We may also accept pretty large negative
4175 // constants, knowing that all objects are in the positive half of the address space.
4176 if (M == CodeModel::Small && Offset < 16*1024*1024)
4179 // For the kernel code model we know that all objects reside in the negative
4180 // half of the 32-bit address space. We must not accept negative offsets, since
4181 // they may fall just outside the object, but we may accept pretty large positive ones.
4182 if (M == CodeModel::Kernel && Offset >= 0)
4188 /// Determines whether the callee is required to pop its own arguments.
4189 /// Callee pop is necessary to support tail calls.
4190 bool X86::isCalleePop(CallingConv::ID CallingConv,
4191 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4192 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4193 // can guarantee TCO.
4194 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4197 switch (CallingConv) {
4200 case CallingConv::X86_StdCall:
4201 case CallingConv::X86_FastCall:
4202 case CallingConv::X86_ThisCall:
4203 case CallingConv::X86_VectorCall:
4208 /// \brief Return true if the condition is an unsigned comparison operation.
4209 static bool isX86CCUnsigned(unsigned X86CC) {
4212 llvm_unreachable("Invalid integer condition!");
4228 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4229 switch (SetCCOpcode) {
4230 default: llvm_unreachable("Invalid integer condition!");
4231 case ISD::SETEQ: return X86::COND_E;
4232 case ISD::SETGT: return X86::COND_G;
4233 case ISD::SETGE: return X86::COND_GE;
4234 case ISD::SETLT: return X86::COND_L;
4235 case ISD::SETLE: return X86::COND_LE;
4236 case ISD::SETNE: return X86::COND_NE;
4237 case ISD::SETULT: return X86::COND_B;
4238 case ISD::SETUGT: return X86::COND_A;
4239 case ISD::SETULE: return X86::COND_BE;
4240 case ISD::SETUGE: return X86::COND_AE;
4244 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4245 /// condition code, returning the condition code and the LHS/RHS of the
4246 /// comparison to make.
4247 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4248 bool isFP, SDValue &LHS, SDValue &RHS,
4249 SelectionDAG &DAG) {
4251 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4252 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4253 // X > -1 -> X == 0, jump !sign.
4254 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4255 return X86::COND_NS;
4257 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4258 // X < 0 -> X == 0, jump on sign.
4261 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
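// X < 1 -> X <= 0.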
4263 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4264 return X86::COND_LE;
4268 return TranslateIntegerX86CC(SetCCOpcode);
4271 // First determine if it is required or is profitable to flip the operands.
4273 // If LHS is a foldable load, but RHS is not, flip the condition.
4274 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4275 !ISD::isNON_EXTLoad(RHS.getNode())) {
4276 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4277 std::swap(LHS, RHS);
4280 switch (SetCCOpcode) {
4286 std::swap(LHS, RHS);
4290 // On a floating point condition, the flags are set as follows:
4291 //  ZF  PF  CF   op
4292 //   0 | 0 | 0 | X > Y
4293 //   0 | 0 | 1 | X < Y
4294 //   1 | 0 | 0 | X == Y
4295 //   1 | 1 | 1 | unordered
4296 switch (SetCCOpcode) {
4297 default: llvm_unreachable("Condcode should be pre-legalized away");
4299 case ISD::SETEQ: return X86::COND_E;
4300 case ISD::SETOLT: // flipped
4302 case ISD::SETGT: return X86::COND_A;
4303 case ISD::SETOLE: // flipped
4305 case ISD::SETGE: return X86::COND_AE;
4306 case ISD::SETUGT: // flipped
4308 case ISD::SETLT: return X86::COND_B;
4309 case ISD::SETUGE: // flipped
4311 case ISD::SETLE: return X86::COND_BE;
4313 case ISD::SETNE: return X86::COND_NE;
4314 case ISD::SETUO: return X86::COND_P;
4315 case ISD::SETO: return X86::COND_NP;
4317 case ISD::SETUNE: return X86::COND_INVALID;
4321 /// Is there a floating point cmov for the specific X86 condition code?
4322 /// The current x86 ISA includes the following FP cmov instructions:
4323 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4324 static bool hasFPCMov(unsigned X86CC) {
4341 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4343 unsigned Intrinsic) const {
4345 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4349 Info.opc = ISD::INTRINSIC_W_CHAIN;
4350 Info.readMem = false;
4351 Info.writeMem = false;
4355 switch (IntrData->Type) {
4356 case EXPAND_FROM_MEM: {
4357 Info.ptrVal = I.getArgOperand(0);
4358 Info.memVT = MVT::getVT(I.getType());
4360 Info.readMem = true;
4363 case COMPRESS_TO_MEM: {
4364 Info.ptrVal = I.getArgOperand(0);
4365 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4367 Info.writeMem = true;
4370 case TRUNCATE_TO_MEM_VI8:
4371 case TRUNCATE_TO_MEM_VI16:
4372 case TRUNCATE_TO_MEM_VI32: {
4373 Info.ptrVal = I.getArgOperand(0);
4374 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4375 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4376 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4378 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4379 ScalarVT = MVT::i16;
4380 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4381 ScalarVT = MVT::i32;
4383 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4385 Info.writeMem = true;
4395 /// Returns true if the target can instruction select the
4396 /// specified FP immediate natively. If false, the legalizer will
4397 /// materialize the FP immediate as a load from a constant pool.
4398 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4399 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4400 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4406 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4407 ISD::LoadExtType ExtTy,
4409 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4410 // relocation target a movq or addq instruction: don't let the load shrink.
4411 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4412 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4413 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4414 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4418 /// \brief Returns true if it is beneficial to convert a load of a constant
4419 /// to just the constant itself.
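// For example, a 64-bit constant such as 0x1234567890ABCDEF can be
// rematerialized with a single movabsq, which is generally cheaper than a
// load from the constant pool, so any integer of 64 bits or fewer is accepted.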
4420 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4422 assert(Ty->isIntegerTy());
4424 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4425 if (BitSize == 0 || BitSize > 64)
4430 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4431 unsigned Index) const {
4432 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4435 return (Index == 0 || Index == ResVT.getVectorNumElements());
4438 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4439 // Speculate cttz only if we can directly use TZCNT.
4440 return Subtarget.hasBMI();
4443 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4444 // Speculate ctlz only if we can directly use LZCNT.
4445 return Subtarget.hasLZCNT();
4448 bool X86TargetLowering::isCtlzFast() const {
4449 return Subtarget.hasFastLZCNT();
4452 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4453 if (!Subtarget.hasBMI())
4456 // There are only 32-bit and 64-bit forms for 'andn'.
4457 EVT VT = Y.getValueType();
4458 if (VT != MVT::i32 && VT != MVT::i64)
4464 /// Val is the undef sentinel value or equal to the specified value.
4465 static bool isUndefOrEqual(int Val, int CmpVal) {
4466 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4469 /// Val is either the undef or zero sentinel value.
4470 static bool isUndefOrZero(int Val) {
4471 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4474 /// Return true if every element in Mask, beginning
4475 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4476 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4477 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4478 if (Mask[i] != SM_SentinelUndef)
4483 /// Return true if Val is undef or if its value falls within the
4484 /// specified range [Low, Hi).
4485 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4486 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4489 /// Return true if every element in Mask is undef or if its value
4490 /// falls within the specified range [Low, Hi).
4491 static bool isUndefOrInRange(ArrayRef<int> Mask,
4494 if (!isUndefOrInRange(M, Low, Hi))
4499 /// Return true if Val is undef, zero or if its value falls within the
4500 /// specified range [Low, Hi).
4501 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4502 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4505 /// Return true if every element in Mask is undef, zero or if its value
4506 /// falls within the specified range [Low, Hi).
4507 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4509 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4514 /// Return true if every element in Mask, beginning
4515 /// from position Pos and ending in Pos+Size, falls within the specified
4516 /// sequential range [Low, Low+Size), or is undef.
4517 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4518 unsigned Pos, unsigned Size, int Low) {
4519 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4520 if (!isUndefOrEqual(Mask[i], Low))
4525 /// Return true if every element in Mask, beginning
4526 /// from position Pos and ending in Pos+Size, falls within the specified
4527 /// sequential range [Low, Low+Size), or is undef or is zero.
4528 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4529 unsigned Size, int Low) {
4530 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4531 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4536 /// Return true if every element in Mask, beginning
4537 /// from position Pos and ending in Pos+Size is undef or is zero.
4538 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4540 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4541 if (!isUndefOrZero(Mask[i]))
4546 /// \brief Helper function to test whether a shuffle mask could be
4547 /// simplified by widening the elements being shuffled.
4549 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4550 /// leaves it in an unspecified state.
4552 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4553 /// shuffle masks. The latter have the special property of a '-2' representing
4554 /// a zero-ed lane of a vector.
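///
/// For example, the v4 mask <0, 1, 6, 7> widens to the v2 mask <0, 3>, while
/// <0, 2, 4, 5> cannot be widened because 0 and 2 are not adjacent.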
4555 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4556 SmallVectorImpl<int> &WidenedMask) {
4557 WidenedMask.assign(Mask.size() / 2, 0);
4558 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4559 // If both elements are undef, it's trivial.
4560 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
4561 WidenedMask[i / 2] = SM_SentinelUndef;
4565 // Check for an undef mask and a mask value properly aligned to fit with
4566 // a pair of values. If we find such a case, use the non-undef mask's value.
4567 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
4568 Mask[i + 1] % 2 == 1) {
4569 WidenedMask[i / 2] = Mask[i + 1] / 2;
4572 if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
4573 WidenedMask[i / 2] = Mask[i] / 2;
4577 // When zeroing, we need to spread the zeroing across both lanes to widen.
4578 if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
4579 if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
4580 (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
4581 WidenedMask[i / 2] = SM_SentinelZero;
4587 // Finally check if the two mask values are adjacent and aligned with
4588 // a pair.
4589 if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
4590 Mask[i] + 1 == Mask[i + 1]) {
4591 WidenedMask[i / 2] = Mask[i] / 2;
4595 // Otherwise we can't safely widen the elements used in this shuffle.
4598 assert(WidenedMask.size() == Mask.size() / 2 &&
4599 "Incorrect size of mask after widening the elements!");
4604 /// Helper function to scale a shuffle or target shuffle mask, replacing each
4605 /// mask index with the scaled sequential indices for an equivalent narrowed
4606 /// mask. This is the reverse process to canWidenShuffleElements, but can
4607 /// always succeed.
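///
/// For example, scaling the v2 mask <1, SM_SentinelUndef> by 2 produces the
/// v4 mask <2, 3, SM_SentinelUndef, SM_SentinelUndef>.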
4608 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
4609 SmallVectorImpl<int> &ScaledMask) {
4610 assert(0 < Scale && "Unexpected scaling factor");
4611 int NumElts = Mask.size();
4612 ScaledMask.assign(NumElts * Scale, -1);
4614 for (int i = 0; i != NumElts; ++i) {
4617 // Repeat sentinel values in every mask element.
4619 for (int s = 0; s != Scale; ++s)
4620 ScaledMask[(Scale * i) + s] = M;
4624 // Scale mask element and increment across each mask element.
4625 for (int s = 0; s != Scale; ++s)
4626 ScaledMask[(Scale * i) + s] = (Scale * M) + s;
4630 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4631 /// extract that is suitable for instructions that extract 128- or 256-bit vectors.
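/// For example, extracting a 128-bit subvector of a v8i32 at element index 4
/// is aligned (4 * 32 == 128), whereas index 2 is not.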
4632 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4633 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4634 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4637 // The index should be aligned on a vecWidth-bit boundary.
4639 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4641 MVT VT = N->getSimpleValueType(0);
4642 unsigned ElSize = VT.getScalarSizeInBits();
4643 bool Result = (Index * ElSize) % vecWidth == 0;
4648 /// Return true if the specified INSERT_SUBVECTOR
4649 /// operand specifies a subvector insert that is suitable for input to
4650 /// the insertion of 128- or 256-bit subvectors.
4651 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4652 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4653 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4655 // The index should be aligned on a vecWidth-bit boundary.
4657 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4659 MVT VT = N->getSimpleValueType(0);
4660 unsigned ElSize = VT.getScalarSizeInBits();
4661 bool Result = (Index * ElSize) % vecWidth == 0;
4666 bool X86::isVINSERT128Index(SDNode *N) {
4667 return isVINSERTIndex(N, 128);
4670 bool X86::isVINSERT256Index(SDNode *N) {
4671 return isVINSERTIndex(N, 256);
4674 bool X86::isVEXTRACT128Index(SDNode *N) {
4675 return isVEXTRACTIndex(N, 128);
4678 bool X86::isVEXTRACT256Index(SDNode *N) {
4679 return isVEXTRACTIndex(N, 256);
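// For example, extracting the upper 128-bit half of a v8i32 vector
// (EXTRACT_SUBVECTOR index 4) gives NumElemsPerChunk = 128 / 32 = 4, so the
// immediate below is 4 / 4 = 1, matching VEXTRACTF128/VEXTRACTI128 with $1.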
4682 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4683 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4684 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4685 "Illegal extract subvector for VEXTRACT");
4688 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4690 MVT VecVT = N->getOperand(0).getSimpleValueType();
4691 MVT ElVT = VecVT.getVectorElementType();
4693 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4694 return Index / NumElemsPerChunk;
4697 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4698 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4699 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4700 "Illegal insert subvector for VINSERT");
4703 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4705 MVT VecVT = N->getSimpleValueType(0);
4706 MVT ElVT = VecVT.getVectorElementType();
4708 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4709 return Index / NumElemsPerChunk;
4712 /// Return the appropriate immediate to extract the specified
4713 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
4714 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4715 return getExtractVEXTRACTImmediate(N, 128);
4718 /// Return the appropriate immediate to extract the specified
4719 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4720 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4721 return getExtractVEXTRACTImmediate(N, 256);
4724 /// Return the appropriate immediate to insert at the specified
4725 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4726 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4727 return getInsertVINSERTImmediate(N, 128);
4730 /// Return the appropriate immediate to insert at the specified
4731 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4732 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4733 return getInsertVINSERTImmediate(N, 256);
4736 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4737 bool X86::isZeroNode(SDValue Elt) {
4738 return isNullConstant(Elt) || isNullFPConstant(Elt);
4741 // Build a vector of constants.
4742 // Use an UNDEF node if MaskElt == -1.
4743 // Split 64-bit constants in 32-bit mode.
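// For example, in 32-bit mode a v2i64 mask constant is emitted as a v4i32
// BUILD_VECTOR (each 64-bit element split into two 32-bit halves) and then
// bitcast back to v2i64, since i64 is not a legal scalar type there.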
4744 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4745 const SDLoc &dl, bool IsMask = false) {
4747 SmallVector<SDValue, 32> Ops;
4750 MVT ConstVecVT = VT;
4751 unsigned NumElts = VT.getVectorNumElements();
4752 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4753 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4754 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4758 MVT EltVT = ConstVecVT.getVectorElementType();
4759 for (unsigned i = 0; i < NumElts; ++i) {
4760 bool IsUndef = Values[i] < 0 && IsMask;
4761 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4762 DAG.getConstant(Values[i], dl, EltVT);
4763 Ops.push_back(OpNode);
4765 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4766 DAG.getConstant(0, dl, EltVT));
4768 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4770 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4774 static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
4775 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4776 assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
4777 SmallVector<SDValue, 32> Ops;
4780 MVT ConstVecVT = VT;
4781 unsigned NumElts = VT.getVectorNumElements();
4782 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4783 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4784 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4788 MVT EltVT = ConstVecVT.getVectorElementType();
4789 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4791 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4794 const APInt &V = Bits[i];
4795 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4797 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4798 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4799 } else if (EltVT == MVT::f32) {
4800 APFloat FV(APFloat::IEEEsingle(), V);
4801 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4802 } else if (EltVT == MVT::f64) {
4803 APFloat FV(APFloat::IEEEdouble(), V);
4804 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4806 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4810 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4811 return DAG.getBitcast(VT, ConstsNode);
4814 /// Returns a vector of specified type with all zero elements.
4815 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4816 SelectionDAG &DAG, const SDLoc &dl) {
4817 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4818 VT.getVectorElementType() == MVT::i1) &&
4819 "Unexpected vector type");
4821 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4822 // type. This ensures they get CSE'd. But if the integer type is not
4823 // available, use a floating-point +0.0 instead.
4825 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4826 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4827 } else if (VT.getVectorElementType() == MVT::i1) {
4828 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4829 "Unexpected vector type");
4830 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4831 "Unexpected vector type");
4832 Vec = DAG.getConstant(0, dl, VT);
4834 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4835 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4837 return DAG.getBitcast(VT, Vec);
4840 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4841 const SDLoc &dl, unsigned vectorWidth) {
4842 EVT VT = Vec.getValueType();
4843 EVT ElVT = VT.getVectorElementType();
4844 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4845 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4846 VT.getVectorNumElements()/Factor);
4848 // Extract from UNDEF is UNDEF.
4849 if (Vec.isUndef())
4850 return DAG.getUNDEF(ResultVT);
4852 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4853 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4854 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4856 // This is the index of the first element of the vectorWidth-bit chunk
4857 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4858 IdxVal &= ~(ElemsPerChunk - 1);
4860 // If the input is a buildvector just emit a smaller one.
4861 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4862 return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
4863 makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
4865 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4866 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4869 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4870 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4871 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4872 /// instructions or a simple subregister reference. Idx is an index in the
4873 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4874 /// lowering EXTRACT_VECTOR_ELT operations easier.
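/// For example, extracting 128 bits at element index 5 of a v8i32 rounds the
/// index down to 4 and returns the upper v4i32 half.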
4875 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4876 SelectionDAG &DAG, const SDLoc &dl) {
4877 assert((Vec.getValueType().is256BitVector() ||
4878 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4879 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4882 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4883 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4884 SelectionDAG &DAG, const SDLoc &dl) {
4885 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4886 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4889 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4890 SelectionDAG &DAG, const SDLoc &dl,
4891 unsigned vectorWidth) {
4892 assert((vectorWidth == 128 || vectorWidth == 256) &&
4893 "Unsupported vector width");
4894 // Inserting an UNDEF subvector leaves Result unchanged.
4895 if (Vec.isUndef())
4896 return Result;
4897 EVT VT = Vec.getValueType();
4898 EVT ElVT = VT.getVectorElementType();
4899 EVT ResultVT = Result.getValueType();
4901 // Insert the relevant vectorWidth bits.
4902 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4903 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4905 // This is the index of the first element of the vectorWidth-bit chunk
4906 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4907 IdxVal &= ~(ElemsPerChunk - 1);
4909 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4910 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4913 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
4914 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4915 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4916 /// simple superregister reference. Idx is an index in the 128 bits
4917 /// we want. It need not be aligned to a 128-bit boundary. That makes
4918 /// lowering INSERT_VECTOR_ELT operations easier.
4919 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4920 SelectionDAG &DAG, const SDLoc &dl) {
4921 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4923 // For insertion into the zero index (low half) of a 256-bit vector, it is
4924 // more efficient to generate a blend with immediate instead of an insert*128.
4925 // We are still creating an INSERT_SUBVECTOR below with an undef node to
4926 // extend the subvector to the size of the result vector. Make sure that
4927 // we are not recursing on that node by checking for undef here.
4928 if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
4929 !Result.isUndef()) {
4930 EVT ResultVT = Result.getValueType();
4931 SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
4932 SDValue Undef = DAG.getUNDEF(ResultVT);
4933 SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
4936 // The blend instruction, and therefore its mask, depend on the data type.
4937 MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
4938 if (ScalarType.isFloatingPoint()) {
4939 // Choose either vblendps (float) or vblendpd (double).
4940 unsigned ScalarSize = ScalarType.getSizeInBits();
4941 assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
4942 unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
4943 SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
4944 return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
4947 const X86Subtarget &Subtarget =
4948 static_cast<const X86Subtarget &>(DAG.getSubtarget());
4950 // AVX2 is needed for 256-bit integer blend support.
4951 // Integers must be cast to 32-bit because there is only vpblendd;
4952 // vpblendw can't be used for this because it has a handicapped mask.
4954 // If we don't have AVX2, then cast to float. Using a wrong domain blend
4955 // is still more efficient than using the wrong domain vinsertf128 that
4956 // will be created by InsertSubVector().
4957 MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
4959 SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
4960 Result = DAG.getBitcast(CastVT, Result);
4961 Vec256 = DAG.getBitcast(CastVT, Vec256);
4962 Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
4963 return DAG.getBitcast(ResultVT, Vec256);
4966 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4969 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4970 SelectionDAG &DAG, const SDLoc &dl) {
4971 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4972 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
4975 /// Insert an i1-subvector into an i1-vector.
4976 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4977 const X86Subtarget &Subtarget) {
4980 SDValue Vec = Op.getOperand(0);
4981 SDValue SubVec = Op.getOperand(1);
4982 SDValue Idx = Op.getOperand(2);
4984 if (!isa<ConstantSDNode>(Idx))
4987 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
4988 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4991 MVT OpVT = Op.getSimpleValueType();
4992 MVT SubVecVT = SubVec.getSimpleValueType();
4993 unsigned NumElems = OpVT.getVectorNumElements();
4994 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4996 assert(IdxVal + SubVecNumElems <= NumElems &&
4997 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4998 "Unexpected index value in INSERT_SUBVECTOR");
5000 // There are 3 possible cases:
5001 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5002 // 2. Subvector should be inserted in the upper part
5003 // (IdxVal + SubVecNumElems == NumElems)
5004 // 3. Subvector should be inserted in the middle (for example v2i1
5005 // to v16i1, index 2)
5007 // extend to natively supported kshift
5008 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5009 MVT WideOpVT = OpVT;
5010 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
5013 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5014 SDValue Undef = DAG.getUNDEF(WideOpVT);
5015 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5016 Undef, SubVec, ZeroIdx);
5018 // Extract the sub-vector if required.
5019 auto ExtractSubVec = [&](SDValue V) {
5020 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
5024 if (Vec.isUndef()) {
5026 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5027 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
5029 return ExtractSubVec(WideSubVec);
5032 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5033 NumElems = WideOpVT.getVectorNumElements();
5034 unsigned ShiftLeft = NumElems - SubVecNumElems;
5035 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5036 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
5037 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5038 Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
5039 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5040 return ExtractSubVec(Vec);
5044 // Zero lower bits of the Vec
5045 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5046 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5047 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
5048 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
5049 // Merge them together, SubVec should be zero extended.
5050 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5051 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5053 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5054 return ExtractSubVec(Vec);
5057 // Simple case when we put subvector in the upper part
5058 if (IdxVal + SubVecNumElems == NumElems) {
5059 // Zero upper bits of the Vec
5060 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
5061 DAG.getConstant(IdxVal, dl, MVT::i8));
5062 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5063 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5064 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
5065 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
5066 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5067 return ExtractSubVec(Vec);
5069 // Subvector should be inserted in the middle - use shuffle
5070 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
5072 SmallVector<int, 64> Mask;
5073 for (unsigned i = 0; i < NumElems; ++i)
5074 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5076 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5079 /// Concatenate two 128-bit vectors into a 256-bit vector using VINSERTF128
5080 /// instructions. This is used because creating CONCAT_VECTORS nodes of
5081 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5082 /// large BUILD_VECTORS.
5083 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5084 unsigned NumElems, SelectionDAG &DAG,
5086 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5087 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5090 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5091 unsigned NumElems, SelectionDAG &DAG,
5093 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5094 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5097 /// Returns a vector of specified type with all bits set.
5098 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5099 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
5100 /// Then bitcast to their original type, ensuring they get CSE'd.
5101 static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
5102 SelectionDAG &DAG, const SDLoc &dl) {
5103 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5104 "Expected a 128/256/512-bit vector type");
5106 APInt Ones = APInt::getAllOnesValue(32);
5107 unsigned NumElts = VT.getSizeInBits() / 32;
5109 if (!Subtarget.hasInt256() && NumElts == 8) {
5110 Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
5111 Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5113 Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5115 return DAG.getBitcast(VT, Vec);
5118 /// Generate unpacklo/unpackhi shuffle mask.
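/// For example, for v8i16 with Lo = true and Unary = false this produces the
/// interleaving mask <0, 8, 1, 9, 2, 10, 3, 11>.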
5119 static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
5121 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5122 int NumElts = VT.getVectorNumElements();
5123 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5125 for (int i = 0; i < NumElts; ++i) {
5126 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5127 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5128 Pos += (Unary ? 0 : NumElts * (i % 2));
5129 Pos += (Lo ? 0 : NumEltsInLane / 2);
5130 Mask.push_back(Pos);
5134 /// Returns a vector_shuffle node for an unpackl operation.
5135 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5136 SDValue V1, SDValue V2) {
5137 SmallVector<int, 8> Mask;
5138 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5139 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5142 /// Returns a vector_shuffle node for an unpackh operation.
5143 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5144 SDValue V1, SDValue V2) {
5145 SmallVector<int, 8> Mask;
5146 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5147 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5150 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5151 /// This produces a shuffle where the low element of V2 is swizzled into the
5152 /// zero/undef vector, landing at element Idx.
5153 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5154 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5156 const X86Subtarget &Subtarget,
5157 SelectionDAG &DAG) {
5158 MVT VT = V2.getSimpleValueType();
5160 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5161 int NumElems = VT.getVectorNumElements();
5162 SmallVector<int, 16> MaskVec(NumElems);
5163 for (int i = 0; i != NumElems; ++i)
5164 // If this is the insertion idx, put the low elt of V2 here.
5165 MaskVec[i] = (i == Idx) ? NumElems : i;
5166 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5169 static SDValue peekThroughBitcasts(SDValue V) {
5170 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5171 V = V.getOperand(0);
5175 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5176 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5177 V.getOperand(0).hasOneUse())
5178 V = V.getOperand(0);
5182 static const Constant *getTargetConstantFromNode(SDValue Op) {
5183 Op = peekThroughBitcasts(Op);
5185 auto *Load = dyn_cast<LoadSDNode>(Op);
5189 SDValue Ptr = Load->getBasePtr();
5190 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5191 Ptr->getOpcode() == X86ISD::WrapperRIP)
5192 Ptr = Ptr->getOperand(0);
5194 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5195 if (!CNode || CNode->isMachineConstantPoolEntry())
5198 return dyn_cast<Constant>(CNode->getConstVal());
5201 // Extract raw constant bits from constant pools.
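// For example, a v4i32 constant <0, -1, undef, 7> queried with
// EltSizeInBits == 64 yields the two 64-bit elements 0xFFFFFFFF00000000 and
// 0x7; an element is reported undef only if every bit covering it is undef,
// otherwise its undef lanes are treated as zero.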
5202 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5203 SmallBitVector &UndefElts,
5204 SmallVectorImpl<APInt> &EltBits) {
5205 assert(UndefElts.empty() && "Expected an empty UndefElts vector");
5206 assert(EltBits.empty() && "Expected an empty EltBits vector");
5208 Op = peekThroughBitcasts(Op);
5210 EVT VT = Op.getValueType();
5211 unsigned SizeInBits = VT.getSizeInBits();
5212 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5213 unsigned NumElts = SizeInBits / EltSizeInBits;
5215 // Extract all the undef/constant element data and pack into single bitsets.
5216 APInt UndefBits(SizeInBits, 0);
5217 APInt MaskBits(SizeInBits, 0);
5219 // Split the undef/constant single bitset data into the target elements.
5220 auto SplitBitData = [&]() {
5221 UndefElts = SmallBitVector(NumElts, false);
5222 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5224 for (unsigned i = 0; i != NumElts; ++i) {
5225 APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
5226 UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
5228 // Only treat an element as UNDEF if all bits are UNDEF, otherwise
5229 // treat it as zero.
5230 if (UndefEltBits.isAllOnesValue()) {
5231 UndefElts[i] = true;
5235 APInt Bits = MaskBits.lshr(i * EltSizeInBits);
5236 Bits = Bits.zextOrTrunc(EltSizeInBits);
5237 EltBits[i] = Bits.getZExtValue();
5242 auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
5246 unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
5247 if (isa<UndefValue>(Cst)) {
5248 Mask = APInt::getNullValue(SizeInBits);
5249 Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
5252 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5253 Mask = CInt->getValue().zextOrTrunc(SizeInBits);
5254 Undefs = APInt::getNullValue(SizeInBits);
5257 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5258 Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
5259 Undefs = APInt::getNullValue(SizeInBits);
5265 // Extract constant bits from constant pool vector.
5266 if (auto *Cst = getTargetConstantFromNode(Op)) {
5267 Type *CstTy = Cst->getType();
5268 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5271 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5272 for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
5274 if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
5276 MaskBits |= Bits.shl(i * CstEltSizeInBits);
5277 UndefBits |= Undefs.shl(i * CstEltSizeInBits);
5280 return SplitBitData();
5283 // Extract constant bits from a broadcasted constant pool scalar.
5284 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5285 EltSizeInBits <= Op.getScalarValueSizeInBits()) {
5286 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5288 if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
5289 unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
5290 unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
5291 for (unsigned i = 0; i != NumBroadcastElts; ++i) {
5292 MaskBits |= Bits.shl(i * NumBroadcastBits);
5293 UndefBits |= Undefs.shl(i * NumBroadcastBits);
5295 return SplitBitData();
5303 // TODO: Merge more of this with getTargetConstantBitsFromNode.
5304 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5305 unsigned MaskEltSizeInBits,
5306 SmallVectorImpl<uint64_t> &RawMask) {
5307 MaskNode = peekThroughBitcasts(MaskNode);
5309 MVT VT = MaskNode.getSimpleValueType();
5310 assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
5311 unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
5313 // Split an APInt element into MaskEltSizeInBits sized pieces and
5314 // insert into the shuffle mask.
5315 auto SplitElementToMask = [&](APInt Element) {
5316 // Note that this is x86 and so always little endian: the low byte is
5317 // the first byte of the mask.
5318 int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
5319 for (int i = 0; i < Split; ++i) {
5320 APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
5321 Element = Element.lshr(MaskEltSizeInBits);
5322 RawMask.push_back(RawElt.getZExtValue());
5326 if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
5327 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
5328 // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
5329 if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
5331 if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
5332 const APInt &MaskElement = CN->getAPIntValue();
5333 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
5334 APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
5335 RawMask.push_back(RawElt.getZExtValue());
5341 if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
5342 MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
5343 SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
5344 if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
5345 if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
5346 RawMask.push_back(CN->getZExtValue());
5347 RawMask.append(NumMaskElts - 1, 0);
5351 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
5352 unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
5353 SplitElementToMask(CN->getAPIntValue());
5354 RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
5361 if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
5364 // We can always decode if the buildvector is all zero constants,
5365 // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
5366 if (all_of(MaskNode->ops(), X86::isZeroNode)) {
5367 RawMask.append(NumMaskElts, 0);
5371 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
5372 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
5375 for (SDValue Op : MaskNode->ops()) {
5376 if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
5377 SplitElementToMask(CN->getAPIntValue());
5378 else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
5379 SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
5387 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5388 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5389 /// operands in \p Ops, and returns true.
5390 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5391 /// IsUnary for shuffles which use a single input multiple times, and in those
5392 /// cases it will adjust the mask to only have indices within that single input.
5393 /// It is an error to call this with non-empty Mask/Ops vectors.
5394 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5395 SmallVectorImpl<SDValue> &Ops,
5396 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5397 unsigned NumElems = VT.getVectorNumElements();
5400 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5401 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5404 bool IsFakeUnary = false;
5405 switch(N->getOpcode()) {
5406 case X86ISD::BLENDI:
5407 ImmN = N->getOperand(N->getNumOperands()-1);
5408 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5411 ImmN = N->getOperand(N->getNumOperands()-1);
5412 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5413 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5415 case X86ISD::INSERTPS:
5416 ImmN = N->getOperand(N->getNumOperands()-1);
5417 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5418 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5420 case X86ISD::UNPCKH:
5421 DecodeUNPCKHMask(VT, Mask);
5422 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5424 case X86ISD::UNPCKL:
5425 DecodeUNPCKLMask(VT, Mask);
5426 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5428 case X86ISD::MOVHLPS:
5429 DecodeMOVHLPSMask(NumElems, Mask);
5430 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5432 case X86ISD::MOVLHPS:
5433 DecodeMOVLHPSMask(NumElems, Mask);
5434 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5436 case X86ISD::PALIGNR:
5437 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5438 ImmN = N->getOperand(N->getNumOperands()-1);
5439 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5440 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5441 Ops.push_back(N->getOperand(1));
5442 Ops.push_back(N->getOperand(0));
5444 case X86ISD::VSHLDQ:
5445 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5446 ImmN = N->getOperand(N->getNumOperands() - 1);
5447 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5450 case X86ISD::VSRLDQ:
5451 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5452 ImmN = N->getOperand(N->getNumOperands() - 1);
5453 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5456 case X86ISD::PSHUFD:
5457 case X86ISD::VPERMILPI:
5458 ImmN = N->getOperand(N->getNumOperands()-1);
5459 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5462 case X86ISD::PSHUFHW:
5463 ImmN = N->getOperand(N->getNumOperands()-1);
5464 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5467 case X86ISD::PSHUFLW:
5468 ImmN = N->getOperand(N->getNumOperands()-1);
5469 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5472 case X86ISD::VZEXT_MOVL:
5473 DecodeZeroMoveLowMask(VT, Mask);
5476 case X86ISD::VBROADCAST: {
5477 // We only decode broadcasts of same-sized vectors at the moment.
5478 if (N->getOperand(0).getValueType() == VT) {
5479 DecodeVectorBroadcast(VT, Mask);
5485 case X86ISD::VPERMILPV: {
5487 SDValue MaskNode = N->getOperand(1);
5488 unsigned MaskEltSize = VT.getScalarSizeInBits();
5489 SmallVector<uint64_t, 32> RawMask;
5490 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5491 DecodeVPERMILPMask(VT, RawMask, Mask);
5494 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5495 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5500 case X86ISD::PSHUFB: {
5502 SDValue MaskNode = N->getOperand(1);
5503 SmallVector<uint64_t, 32> RawMask;
5504 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5505 DecodePSHUFBMask(RawMask, Mask);
5508 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5509 DecodePSHUFBMask(C, Mask);
5514 case X86ISD::VPERMI:
5515 ImmN = N->getOperand(N->getNumOperands()-1);
5516 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5521 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5523 case X86ISD::VPERM2X128:
5524 ImmN = N->getOperand(N->getNumOperands()-1);
5525 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5526 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5528 case X86ISD::MOVSLDUP:
5529 DecodeMOVSLDUPMask(VT, Mask);
5532 case X86ISD::MOVSHDUP:
5533 DecodeMOVSHDUPMask(VT, Mask);
5536 case X86ISD::MOVDDUP:
5537 DecodeMOVDDUPMask(VT, Mask);
5540 case X86ISD::MOVLHPD:
5541 case X86ISD::MOVLPD:
5542 case X86ISD::MOVLPS:
5543 // Not yet implemented
5545 case X86ISD::VPERMIL2: {
5546 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5547 unsigned MaskEltSize = VT.getScalarSizeInBits();
5548 SDValue MaskNode = N->getOperand(2);
5549 SDValue CtrlNode = N->getOperand(3);
5550 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5551 unsigned CtrlImm = CtrlOp->getZExtValue();
5552 SmallVector<uint64_t, 32> RawMask;
5553 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5554 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5557 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5558 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5564 case X86ISD::VPPERM: {
5565 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5566 SDValue MaskNode = N->getOperand(2);
5567 SmallVector<uint64_t, 32> RawMask;
5568 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5569 DecodeVPPERMMask(RawMask, Mask);
5572 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5573 DecodeVPPERMMask(C, Mask);
5578 case X86ISD::VPERMV: {
5580 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5581 Ops.push_back(N->getOperand(1));
5582 SDValue MaskNode = N->getOperand(0);
5583 SmallVector<uint64_t, 32> RawMask;
5584 unsigned MaskEltSize = VT.getScalarSizeInBits();
5585 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5586 DecodeVPERMVMask(RawMask, Mask);
5589 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5590 DecodeVPERMVMask(C, MaskEltSize, Mask);
5595 case X86ISD::VPERMV3: {
5596 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5597 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5598 Ops.push_back(N->getOperand(0));
5599 Ops.push_back(N->getOperand(2));
5600 SDValue MaskNode = N->getOperand(1);
5601 unsigned MaskEltSize = VT.getScalarSizeInBits();
5602 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5603 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5608 case X86ISD::VPERMIV3: {
5609 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5610 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5611 Ops.push_back(N->getOperand(1));
5612 Ops.push_back(N->getOperand(2));
5613 SDValue MaskNode = N->getOperand(0);
5614 unsigned MaskEltSize = VT.getScalarSizeInBits();
5615 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5616 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5621 default: llvm_unreachable("unknown target shuffle node");
5624 // Empty mask indicates the decode failed.
5628 // Check if we're getting a shuffle mask with zero'd elements.
5629 if (!AllowSentinelZero)
5630 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5633 // If we have a fake unary shuffle, the shuffle mask is spread across two
5634 // inputs that are actually the same node. Re-map the mask to always point
5635 // into the first input.
5638 if (M >= (int)Mask.size())
5641 // If we didn't already add operands in the opcode-specific code, default to
5642 // adding 1 or 2 operands starting at 0.
5644 Ops.push_back(N->getOperand(0));
5645 if (!IsUnary || IsFakeUnary)
5646 Ops.push_back(N->getOperand(1));
5652 /// Check a target shuffle mask's inputs to see if we can set any values to
5653 /// SM_SentinelZero - this is for elements that are known to be zero
5654 /// (not just zeroable) from their inputs.
5655 /// Returns true if the target shuffle mask was decoded.
5656 static bool setTargetShuffleZeroElements(SDValue N,
5657 SmallVectorImpl<int> &Mask,
5658 SmallVectorImpl<SDValue> &Ops) {
5660 if (!isTargetShuffle(N.getOpcode()))
5663 MVT VT = N.getSimpleValueType();
5664 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5667 SDValue V1 = Ops[0];
5668 SDValue V2 = IsUnary ? V1 : Ops[1];
5670 V1 = peekThroughBitcasts(V1);
5671 V2 = peekThroughBitcasts(V2);
5673 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5676 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5680 // Determine shuffle input and normalize the mask.
5681 SDValue V = M < Size ? V1 : V2;
5684 // We are referencing an UNDEF input.
5686 Mask[i] = SM_SentinelUndef;
5690 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5691 if (V.getOpcode() != ISD::BUILD_VECTOR)
5694 // If the BUILD_VECTOR has fewer elements than the (larger) source
5695 // element must be UNDEF/ZERO.
5696 // TODO: Is it worth testing the individual bits of a constant?
5697 if ((Size % V.getNumOperands()) == 0) {
5698 int Scale = Size / V->getNumOperands();
5699 SDValue Op = V.getOperand(M / Scale);
5701 Mask[i] = SM_SentinelUndef;
5702 else if (X86::isZeroNode(Op))
5703 Mask[i] = SM_SentinelZero;
5707 // If the BUILD_VECTOR has more elements than all the (smaller) source
5708 // elements must be all UNDEF or all ZERO.
5709 if ((V.getNumOperands() % Size) == 0) {
5710 int Scale = V->getNumOperands() / Size;
5711 bool AllUndef = true;
5712 bool AllZero = true;
5713 for (int j = 0; j < Scale; ++j) {
5714 SDValue Op = V.getOperand((M * Scale) + j);
5715 AllUndef &= Op.isUndef();
5716 AllZero &= X86::isZeroNode(Op);
5719 Mask[i] = SM_SentinelUndef;
5721 Mask[i] = SM_SentinelZero;
5726 assert(VT.getVectorNumElements() == Mask.size() &&
5727 "Different mask size from vector size!");
5731 // Attempt to decode ops that could be represented as a shuffle mask.
5732 // The decoded shuffle mask may contain a different number of elements than the
5733 // destination value type.
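// For example, a v2i64 VSRLI by 16 bits is decoded below as the byte shuffle
// <2, 3, 4, 5, 6, 7, Z, Z, 10, 11, 12, 13, 14, 15, Z, Z>, where Z stands for
// SM_SentinelZero.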
5734 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5735 SmallVectorImpl<SDValue> &Ops) {
5739 MVT VT = N.getSimpleValueType();
5740 unsigned NumElts = VT.getVectorNumElements();
5741 unsigned NumSizeInBits = VT.getSizeInBits();
5742 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5743 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5744 "Expected byte aligned value types");
5746 unsigned Opcode = N.getOpcode();
5749 // Attempt to decode as a per-byte mask.
5750 SmallBitVector UndefElts;
5751 SmallVector<APInt, 32> EltBits;
5752 if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits))
5754 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5756 Mask.push_back(SM_SentinelUndef);
5759 uint64_t ByteBits = EltBits[i].getZExtValue();
5760 if (ByteBits != 0 && ByteBits != 255)
5762 Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i);
5764 Ops.push_back(N.getOperand(0));
5768 case X86ISD::VSRLI: {
5769 uint64_t ShiftVal = N.getConstantOperandVal(1);
5770 // Out of range bit shifts are guaranteed to be zero.
5771 if (NumBitsPerElt <= ShiftVal) {
5772 Mask.append(NumElts, SM_SentinelZero);
5776 // We can only decode 'whole byte' bit shifts as shuffles.
5777 if ((ShiftVal % 8) != 0)
5780 uint64_t ByteShift = ShiftVal / 8;
5781 unsigned NumBytes = NumSizeInBits / 8;
5782 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5783 Ops.push_back(N.getOperand(0));
5785 // Clear mask to all zeros and insert the shifted byte indices.
5786 Mask.append(NumBytes, SM_SentinelZero);
5788 if (X86ISD::VSHLI == Opcode) {
5789 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5790 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5791 Mask[i + j] = i + j - ByteShift;
5793 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5794 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5795 Mask[i + j - ByteShift] = i + j;
5799 case X86ISD::VZEXT: {
5800 // TODO - add support for VPMOVZX with smaller input vector types.
5801 SDValue Src = N.getOperand(0);
5802 MVT SrcVT = Src.getSimpleValueType();
5803 if (NumSizeInBits != SrcVT.getSizeInBits())
5805 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
5814 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
5815 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
5816 /// remaining input indices in case we now have a unary shuffle and adjust the
5817 /// Op0/Op1 inputs accordingly.
5818 /// Returns true if the target shuffle mask was decoded.
5819 static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
5820 SmallVectorImpl<int> &Mask) {
5821 SmallVector<SDValue, 2> Ops;
5822 if (!setTargetShuffleZeroElements(Op, Mask, Ops))
5823 if (!getFauxShuffleMask(Op, Mask, Ops))
5826 int NumElts = Mask.size();
5827 bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
5828 return 0 <= Idx && Idx < NumElts;
5830 bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; });
5832 Op0 = Op0InUse ? Ops[0] : SDValue();
5833 Op1 = Op1InUse ? Ops[1] : SDValue();
5835 // We're only using Op1 - commute the mask and inputs.
5836 if (!Op0InUse && Op1InUse) {
5847 /// Returns the scalar element that will make up the ith
5848 /// element of the result of the vector shuffle.
5849 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5852 return SDValue(); // Limit search depth.
5854 SDValue V = SDValue(N, 0);
5855 EVT VT = V.getValueType();
5856 unsigned Opcode = V.getOpcode();
5858 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5859 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5860 int Elt = SV->getMaskElt(Index);
5863 return DAG.getUNDEF(VT.getVectorElementType());
5865 unsigned NumElems = VT.getVectorNumElements();
5866 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5867 : SV->getOperand(1);
5868 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5871 // Recurse into target specific vector shuffles to find scalars.
5872 if (isTargetShuffle(Opcode)) {
5873 MVT ShufVT = V.getSimpleValueType();
5874 MVT ShufSVT = ShufVT.getVectorElementType();
5875 int NumElems = (int)ShufVT.getVectorNumElements();
5876 SmallVector<int, 16> ShuffleMask;
5877 SmallVector<SDValue, 16> ShuffleOps;
5880 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
5883 int Elt = ShuffleMask[Index];
5884 if (Elt == SM_SentinelZero)
5885 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
5886 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
5887 if (Elt == SM_SentinelUndef)
5888 return DAG.getUNDEF(ShufSVT);
5890 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
5891 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
5892 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5896 // Actual nodes that may contain scalar elements
5897 if (Opcode == ISD::BITCAST) {
5898 V = V.getOperand(0);
5899 EVT SrcVT = V.getValueType();
5900 unsigned NumElems = VT.getVectorNumElements();
5902 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5906 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5907 return (Index == 0) ? V.getOperand(0)
5908 : DAG.getUNDEF(VT.getVectorElementType());
5910 if (V.getOpcode() == ISD::BUILD_VECTOR)
5911 return V.getOperand(Index);
5916 /// Custom lower build_vector of v16i8.
5917 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5918 unsigned NumNonZero, unsigned NumZero,
5920 const X86Subtarget &Subtarget,
5921 const TargetLowering &TLI) {
5929 // SSE4.1 - use PINSRB to insert each byte directly.
5930 if (Subtarget.hasSSE41()) {
5931 for (unsigned i = 0; i < 16; ++i) {
5932 bool isNonZero = (NonZeros & (1 << i)) != 0;
5936 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
5938 V = DAG.getUNDEF(MVT::v16i8);
5941 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5942 MVT::v16i8, V, Op.getOperand(i),
5943 DAG.getIntPtrConstant(i, dl));
5950 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
5951 for (unsigned i = 0; i < 16; ++i) {
5952 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5953 if (ThisIsNonZero && First) {
5955 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5957 V = DAG.getUNDEF(MVT::v8i16);
5962 SDValue ThisElt, LastElt;
5963 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5964 if (LastIsNonZero) {
5965 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5966 MVT::i16, Op.getOperand(i-1));
5968 if (ThisIsNonZero) {
5969 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5970 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5971 ThisElt, DAG.getConstant(8, dl, MVT::i8));
5973 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5977 if (ThisElt.getNode())
5978 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5979 DAG.getIntPtrConstant(i/2, dl));
5983 return DAG.getBitcast(MVT::v16i8, V);
5986 /// Custom lower build_vector of v8i16.
5987 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5988 unsigned NumNonZero, unsigned NumZero,
5990 const X86Subtarget &Subtarget,
5991 const TargetLowering &TLI) {
5998 for (unsigned i = 0; i < 8; ++i) {
5999 bool isNonZero = (NonZeros & (1 << i)) != 0;
6003 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6005 V = DAG.getUNDEF(MVT::v8i16);
6008 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
6009 MVT::v8i16, V, Op.getOperand(i),
6010 DAG.getIntPtrConstant(i, dl));
6017 /// Custom lower build_vector of v4i32 or v4f32.
6018 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6019 const X86Subtarget &Subtarget,
6020 const TargetLowering &TLI) {
6021 // Find all zeroable elements.
6022 std::bitset<4> Zeroable;
6023 for (int i=0; i < 4; ++i) {
6024 SDValue Elt = Op->getOperand(i);
6025 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6027 assert(Zeroable.size() - Zeroable.count() > 1 &&
6028 "We expect at least two non-zero elements!");
6030 // We only know how to deal with build_vector nodes where elements are either
6031 // zeroable or extract_vector_elt with constant index.
6032 SDValue FirstNonZero;
6033 unsigned FirstNonZeroIdx;
6034 for (unsigned i=0; i < 4; ++i) {
6037 SDValue Elt = Op->getOperand(i);
6038 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6039 !isa<ConstantSDNode>(Elt.getOperand(1)))
6041 // Make sure that this node is extracting from a 128-bit vector.
6042 MVT VT = Elt.getOperand(0).getSimpleValueType();
6043 if (!VT.is128BitVector())
6045 if (!FirstNonZero.getNode()) {
6047 FirstNonZeroIdx = i;
6051 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6052 SDValue V1 = FirstNonZero.getOperand(0);
6053 MVT VT = V1.getSimpleValueType();
6055 // See if this build_vector can be lowered as a blend with zero.
6057 unsigned EltMaskIdx, EltIdx;
6059 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6060 if (Zeroable[EltIdx]) {
6061 // The zero vector will be on the right hand side.
6062 Mask[EltIdx] = EltIdx+4;
6066 Elt = Op->getOperand(EltIdx);
6067 // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
6068 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
6069 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6071 Mask[EltIdx] = EltIdx;
6075 // Let the shuffle legalizer deal with blend operations.
6076 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6077 if (V1.getSimpleValueType() != VT)
6078 V1 = DAG.getBitcast(VT, V1);
6079 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6082 // See if we can lower this build_vector to an INSERTPS.
6083 if (!Subtarget.hasSSE41())
6086 SDValue V2 = Elt.getOperand(0);
6087 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6090 bool CanFold = true;
6091 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6095 SDValue Current = Op->getOperand(i);
6096 SDValue SrcVector = Current->getOperand(0);
6099 CanFold = SrcVector == V1 &&
6100 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
6106 assert(V1.getNode() && "Expected at least two non-zero elements!");
6107 if (V1.getSimpleValueType() != MVT::v4f32)
6108 V1 = DAG.getBitcast(MVT::v4f32, V1);
6109 if (V2.getSimpleValueType() != MVT::v4f32)
6110 V2 = DAG.getBitcast(MVT::v4f32, V2);
6112 // Ok, we can emit an INSERTPS instruction.
6113 unsigned ZMask = Zeroable.to_ulong();
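// The INSERTPS immediate encodes the source element in bits [7:6], the
// destination slot in bits [5:4] and the zero mask in bits [3:0].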
6115 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6116 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6118 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6119 DAG.getIntPtrConstant(InsertPSMask, DL));
6120 return DAG.getBitcast(VT, Result);
6123 /// Return a vector logical shift node.
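/// For example, getVShift(true, MVT::v2i64, V, 64, ...) bitcasts V to v16i8
/// and emits an 8-byte left shift of the whole vector via X86ISD::VSHLDQ.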
6124 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6125 SelectionDAG &DAG, const TargetLowering &TLI,
6127 assert(VT.is128BitVector() && "Unknown type for VShift");
6128 MVT ShVT = MVT::v16i8;
6129 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6130 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6131 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6132 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6133 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6134 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6137 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6138 SelectionDAG &DAG) {
6140 // Check if the scalar load can be widened into a vector load. And if
6141 // the address is "base + cst" see if the cst can be "absorbed" into
6142 // the shuffle mask.
6143 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6144 SDValue Ptr = LD->getBasePtr();
6145 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6146 return SDValue();
6147 EVT PVT = LD->getValueType(0);
6148 if (PVT != MVT::i32 && PVT != MVT::f32)
6149 return SDValue();
6151 int FI = -1;
6152 int64_t Offset = 0;
6153 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6154 FI = FINode->getIndex();
6156 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6157 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6158 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6159 Offset = Ptr.getConstantOperandVal(1);
6160 Ptr = Ptr.getOperand(0);
6165 // FIXME: 256-bit vector instructions don't require such strict alignment;
6166 // improve this code to support them better.
6167 unsigned RequiredAlign = VT.getSizeInBits()/8;
6168 SDValue Chain = LD->getChain();
6169 // Make sure the stack object alignment is at least 16 or 32.
6170 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6171 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6172 if (MFI.isFixedObjectIndex(FI)) {
6173 // Can't change the alignment. FIXME: It's possible to compute
6174 // the exact stack offset and reference FI + adjusted offset instead.
6175 // That's the way to implement it if someone *really* cares about this.
6178 MFI.setObjectAlignment(FI, RequiredAlign);
6182 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6183 // Ptr + (Offset & ~(RequiredAlign - 1)).
6186 if ((Offset % RequiredAlign) & 3)
6188 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6191 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6192 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6195 int EltNo = (Offset - StartOffset) >> 2;
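// A worked example (a sketch; assumes a 16-byte RequiredAlign): for an f32
// load at Offset == 20 from a realigned stack object, Offset % 16 == 4 is a
// multiple of 4, StartOffset == (20 & ~15) == 16, and EltNo == (20 - 16) >> 2
// == 1, so we emit a v4f32 load at Ptr + 16 and splat element 1 of it.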
6196 unsigned NumElems = VT.getVectorNumElements();
6198 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6199 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6200 LD->getPointerInfo().getWithOffset(StartOffset));
6202 SmallVector<int, 8> Mask(NumElems, EltNo);
6204 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6210 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6211 /// elements can be replaced by a single large load which has the same value as
6212 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6214 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6215 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6216 SDLoc &DL, SelectionDAG &DAG,
6217 bool isAfterLegalize) {
6218 unsigned NumElems = Elts.size();
6220 int LastLoadedElt = -1;
6221 SmallBitVector LoadMask(NumElems, false);
6222 SmallBitVector ZeroMask(NumElems, false);
6223 SmallBitVector UndefMask(NumElems, false);
6225 // For each element in the initializer, see if we've found a load, zero or an
6227 for (unsigned i = 0; i < NumElems; ++i) {
6228 SDValue Elt = peekThroughBitcasts(Elts[i]);
6233 UndefMask[i] = true;
6234 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6236 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6239 // Each loaded element must be the correct fractional portion of the
6240 // requested vector load.
6241 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6246 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6247 "Incomplete element masks");
6249 // Handle Special Cases - all undef or undef/zero.
6250 if (UndefMask.count() == NumElems)
6251 return DAG.getUNDEF(VT);
6253 // FIXME: Should we return this as a BUILD_VECTOR instead?
6254 if ((ZeroMask | UndefMask).count() == NumElems)
6255 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6256 : DAG.getConstantFP(0.0, DL, VT);
6258 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6259 int FirstLoadedElt = LoadMask.find_first();
6260 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6261 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6262 EVT LDBaseVT = EltBase.getValueType();
6264 // Consecutive loads can contain UNDEFs but not ZERO elements.
6265 // Consecutive loads with UNDEF and ZERO elements require
6266 // an additional shuffle stage to clear the ZERO elements.
6267 bool IsConsecutiveLoad = true;
6268 bool IsConsecutiveLoadWithZeros = true;
6269 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6271 SDValue Elt = peekThroughBitcasts(Elts[i]);
6272 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6273 if (!DAG.areNonVolatileConsecutiveLoads(
6274 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6275 i - FirstLoadedElt)) {
6276 IsConsecutiveLoad = false;
6277 IsConsecutiveLoadWithZeros = false;
6280 } else if (ZeroMask[i]) {
6281 IsConsecutiveLoad = false;
6285 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6286 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6287 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6288 "Cannot merge volatile loads.");
6289 SDValue NewLd =
6290 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6291 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6293 if (LDBase->hasAnyUseOfValue(1)) {
6294 SDValue NewChain =
6295 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6296 SDValue(NewLd.getNode(), 1));
6297 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6298 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6299 SDValue(NewLd.getNode(), 1));
6300 }
6302 return NewLd;
6303 };
6305 // LOAD - all consecutive load/undefs (must start/end with a load).
6306 // If we have found an entire vector of loads and undefs, then return a large
6307 // load of the entire vector width starting at the base pointer.
6308 // If the vector contains zeros, then attempt to shuffle those elements.
6309 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6310 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6311 assert(LDBase && "Did not find base load for merging consecutive loads");
6312 EVT EltVT = LDBase->getValueType(0);
6313 // Ensure that the input vector size for the merged loads matches the
6314 // cumulative size of the input elements.
6315 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6318 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6319 return SDValue();
6321 if (IsConsecutiveLoad)
6322 return CreateLoad(VT, LDBase);
6324 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6325 // vector and a zero vector to clear out the zero elements.
6326 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6327 SmallVector<int, 4> ClearMask(NumElems, -1);
6328 for (unsigned i = 0; i < NumElems; ++i) {
6330 ClearMask[i] = i + NumElems;
6331 else if (LoadMask[i])
6334 SDValue V = CreateLoad(VT, LDBase);
6335 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6336 : DAG.getConstantFP(0.0, DL, VT);
6337 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6338 }
6339 }
6341 unsigned LoadSize =
6342 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6344 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6345 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6346 (LoadSize == 32 || LoadSize == 64) &&
6347 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6348 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6349 : MVT::getIntegerVT(LoadSize);
6350 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6351 if (TLI.isTypeLegal(VecVT)) {
6352 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6353 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6354 SDValue ResNode =
6355 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6356 LDBase->getPointerInfo(),
6357 LDBase->getAlignment(),
6358 false/*isVolatile*/, true/*ReadMem*/,
6361 // Make sure the newly-created LOAD is in the same position as LDBase in
6362 // terms of dependency. We create a TokenFactor for LDBase and ResNode,
6363 // and update uses of LDBase's output chain to use the TokenFactor.
6364 if (LDBase->hasAnyUseOfValue(1)) {
6366 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6367 SDValue(ResNode.getNode(), 1));
6368 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6369 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6370 SDValue(ResNode.getNode(), 1));
6373 return DAG.getBitcast(VT, ResNode);
6380 static Constant *getConstantVector(MVT VT, APInt SplatValue,
6381 unsigned SplatBitSize, LLVMContext &C) {
6382 unsigned ScalarSize = VT.getScalarSizeInBits();
6383 unsigned NumElm = SplatBitSize / ScalarSize;
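// An illustrative sketch of what this computes: with ScalarSize == 32 (i32
// elements), SplatValue == 0x0000000100000002 and SplatBitSize == 64, NumElm
// is 2 and the loop below extracts the low scalar first, producing the
// constant vector <i32 2, i32 1>.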
6385 SmallVector<Constant *, 32> ConstantVec;
6386 for (unsigned i = 0; i < NumElm; i++) {
6387 APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
6389 if (VT.isFloatingPoint()) {
6390 assert((ScalarSize == 32 || ScalarSize == 64) &&
6391 "Unsupported floating point scalar size");
6392 if (ScalarSize == 32)
6393 Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
6395 Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
6397 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6398 ConstantVec.push_back(Const);
6400 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6403 static bool isUseOfShuffle(SDNode *N) {
6404 for (auto *U : N->uses()) {
6405 if (isTargetShuffle(U->getOpcode()))
6407 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6408 return isUseOfShuffle(U);
6413 /// Attempt to use the vbroadcast instruction to generate a splat value for the
6414 /// following cases:
6415 /// 1. A splat BUILD_VECTOR which uses:
6416 /// a. A single scalar load, or a constant.
6417 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6418 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6419 /// a scalar load, or a constant.
6421 /// The VBROADCAST node is returned when a pattern is found,
6422 /// or SDValue() otherwise.
6423 static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
6424 SelectionDAG &DAG) {
6425 // VBROADCAST requires AVX.
6426 // TODO: Splats could be generated for non-AVX CPUs using SSE
6427 // instructions, but there's less potential gain for only 128-bit vectors.
6428 if (!Subtarget.hasAVX())
6429 return SDValue();
6431 MVT VT = BVOp->getSimpleValueType(0);
6434 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6435 "Unsupported vector type for broadcast.");
6437 BitVector UndefElements;
6438 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6440 // We need a splat of a single value to use broadcast, and it doesn't
6441 // make any sense if the value is only in one element of the vector.
6442 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6443 APInt SplatValue, Undef;
6444 unsigned SplatBitSize;
6445 bool HasUndef;
6446 // Check if this is a repeated constant pattern suitable for broadcasting.
6447 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6448 SplatBitSize > VT.getScalarSizeInBits() &&
6449 SplatBitSize < VT.getSizeInBits()) {
6450 // Avoid replacing with broadcast when it's a use of a shuffle
6451 // instruction to preserve the present custom lowering of shuffles.
6452 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6453 return SDValue();
6454 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
6455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6456 LLVMContext *Ctx = DAG.getContext();
6457 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6458 if (Subtarget.hasAVX()) {
6459 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6460 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6461 // Splatted value can fit in one INTEGER constant in constant pool.
6462 // Load the constant and broadcast it.
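// For example (a sketch of the intent): a v8i32 build_vector repeating
// <0, 1> has SplatBitSize == 64, so with AVX2 we can place the single i64
// constant 0x0000000100000000 in the constant pool, VBROADCAST it as v4i64,
// and bitcast the result back to v8i32.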
6463 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6464 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6465 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6466 SDValue CP = DAG.getConstantPool(C, PVT);
6467 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6469 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6470 Ld = DAG.getLoad(
6471 CVT, dl, DAG.getEntryNode(), CP,
6472 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6473 Alignment);
6474 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6475 MVT::getVectorVT(CVT, Repeat), Ld);
6476 return DAG.getBitcast(VT, Brdcst);
6477 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6478 // Splatted value can fit in one FLOAT constant in constant pool.
6479 // Load the constant and broadcast it.
6480 // AVX has support for 32-bit and 64-bit broadcasts of floats only, and
6481 // there is no 64-bit integer broadcast on a 32-bit subtarget.
6482 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6483 Constant *C = SplatBitSize == 32
6484 ? ConstantFP::get(Type::getFloatTy(*Ctx),
6485 SplatValue.bitsToFloat())
6486 : ConstantFP::get(Type::getDoubleTy(*Ctx),
6487 SplatValue.bitsToDouble());
6488 SDValue CP = DAG.getConstantPool(C, PVT);
6489 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6491 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6492 Ld = DAG.getLoad(
6493 CVT, dl, DAG.getEntryNode(), CP,
6494 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6495 Alignment);
6496 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6497 MVT::getVectorVT(CVT, Repeat), Ld);
6498 return DAG.getBitcast(VT, Brdcst);
6499 } else if (SplatBitSize > 64) {
6500 // Load the vector of constants and broadcast it.
6501 MVT CVT = VT.getScalarType();
6502 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6503 *Ctx);
6504 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6505 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6506 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6507 Ld = DAG.getLoad(
6508 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6509 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6510 Alignment);
6511 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6512 return DAG.getBitcast(VT, Brdcst);
6519 bool ConstSplatVal =
6520 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6522 // Make sure that all of the users of a non-constant load are from the
6523 // BUILD_VECTOR node.
6524 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6527 unsigned ScalarSize = Ld.getValueSizeInBits();
6528 bool IsGE256 = (VT.getSizeInBits() >= 256);
6530 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6531 // instruction to save 8 or more bytes of constant pool data.
6532 // TODO: If multiple splats are generated to load the same constant,
6533 // it may be detrimental to overall size. There needs to be a way to detect
6534 // that condition to know if this is truly a size win.
6535 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6537 // Handle broadcasting a single constant scalar from the constant pool
6539 // On Sandybridge (no AVX2), it is still better to load a constant vector
6540 // from the constant pool and not to broadcast it from a scalar.
6541 // But override that restriction when optimizing for size.
6542 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6543 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6544 EVT CVT = Ld.getValueType();
6545 assert(!CVT.isVector() && "Must not broadcast a vector type");
6547 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6548 // For size optimization, also splat v2f64 and v2i64, and for size opt
6549 // with AVX2, also splat i8 and i16.
6550 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6551 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6552 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6553 const Constant *C = nullptr;
6554 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6555 C = CI->getConstantIntValue();
6556 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6557 C = CF->getConstantFPValue();
6559 assert(C && "Invalid constant type");
6561 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6562 SDValue CP =
6563 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6564 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6565 Ld = DAG.getLoad(
6566 CVT, dl, DAG.getEntryNode(), CP,
6567 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6568 Alignment);
6570 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6574 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6576 // Handle AVX2 in-register broadcasts.
6577 if (!IsLoad && Subtarget.hasInt256() &&
6578 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6579 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6581 // The scalar source must be a normal load.
6582 if (!IsLoad)
6583 return SDValue();
6585 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6586 (Subtarget.hasVLX() && ScalarSize == 64))
6587 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6589 // The integer check is needed for the 64-bit into 128-bit case, so that it
6590 // doesn't match double, since there is no vbroadcastsd xmm.
6591 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6592 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6593 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6596 // Unsupported broadcast.
6600 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6601 /// underlying vector and index.
6603 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6605 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6607 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6608 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6611 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6613 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6615 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6616 // (extract_subvector (v8f32 %vreg0), Constant<4>),
6619 // In this case the vector is the extract_subvector expression and the index
6620 // is 2, as specified by the shuffle.
6621 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6622 SDValue ShuffleVec = SVOp->getOperand(0);
6623 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6624 assert(ShuffleVecVT.getVectorElementType() ==
6625 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6627 int ShuffleIdx = SVOp->getMaskElt(Idx);
6628 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6629 ExtractedFromVec = ShuffleVec;
6635 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6636 MVT VT = Op.getSimpleValueType();
6638 // Skip if insert_vec_elt is not supported.
6639 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6640 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6644 unsigned NumElems = Op.getNumOperands();
6648 SmallVector<unsigned, 4> InsertIndices;
6649 SmallVector<int, 8> Mask(NumElems, -1);
6651 for (unsigned i = 0; i != NumElems; ++i) {
6652 unsigned Opc = Op.getOperand(i).getOpcode();
6654 if (Opc == ISD::UNDEF)
6657 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6658 // Quit if more than 1 element needs inserting.
6659 if (InsertIndices.size() > 1)
6662 InsertIndices.push_back(i);
6666 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6667 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6668 // Quit if non-constant index.
6669 if (!isa<ConstantSDNode>(ExtIdx))
6671 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6673 // Quit if extracted from vector of different type.
6674 if (ExtractedFromVec.getValueType() != VT)
6677 if (!VecIn1.getNode())
6678 VecIn1 = ExtractedFromVec;
6679 else if (VecIn1 != ExtractedFromVec) {
6680 if (!VecIn2.getNode())
6681 VecIn2 = ExtractedFromVec;
6682 else if (VecIn2 != ExtractedFromVec)
6683 // Quit if more than 2 vectors to shuffle
6687 if (ExtractedFromVec == VecIn1)
6689 else if (ExtractedFromVec == VecIn2)
6690 Mask[i] = Idx + NumElems;
6693 if (!VecIn1.getNode())
6696 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6697 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6698 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6699 unsigned Idx = InsertIndices[i];
6700 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6701 DAG.getIntPtrConstant(Idx, DL));
6707 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6708 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6709 Op.getScalarValueSizeInBits() == 1 &&
6710 "Can not convert non-constant vector");
6711 uint64_t Immediate = 0;
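// A small worked example: for a constant v8i1 build_vector
// <1,0,1,1,0,0,0,1>, the loop below sets bits 0, 2, 3 and 7, producing the
// i8 immediate 0b10001101 (0x8D).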
6712 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6713 SDValue In = Op.getOperand(idx);
6715 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6718 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
6719 return DAG.getConstant(Immediate, dl, VT);
6721 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6723 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6725 MVT VT = Op.getSimpleValueType();
6726 assert((VT.getVectorElementType() == MVT::i1) &&
6727 "Unexpected type in LowerBUILD_VECTORvXi1!");
6730 if (ISD::isBuildVectorAllZeros(Op.getNode()))
6731 return DAG.getTargetConstant(0, dl, VT);
6733 if (ISD::isBuildVectorAllOnes(Op.getNode()))
6734 return DAG.getTargetConstant(1, dl, VT);
6736 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6737 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6738 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6739 return DAG.getBitcast(VT, Imm);
6740 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6741 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6742 DAG.getIntPtrConstant(0, dl));
6745 // Vector has one or more non-const elements
6746 uint64_t Immediate = 0;
6747 SmallVector<unsigned, 16> NonConstIdx;
6748 bool IsSplat = true;
6749 bool HasConstElts = false;
6751 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6752 SDValue In = Op.getOperand(idx);
6755 if (!isa<ConstantSDNode>(In))
6756 NonConstIdx.push_back(idx);
6758 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6759 HasConstElts = true;
6763 else if (In != Op.getOperand(SplatIdx))
6767 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
6769 return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
6770 DAG.getConstant(1, dl, VT),
6771 DAG.getConstant(0, dl, VT));
6773 // Insert elements one by one.
6774 SDValue DstVec;
6775 SDValue Imm;
6776 if (Immediate) {
6777 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6778 Imm = DAG.getConstant(Immediate, dl, ImmVT);
6779 }
6780 else if (HasConstElts)
6781 Imm = DAG.getConstant(0, dl, VT);
6783 Imm = DAG.getUNDEF(VT);
6784 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6785 DstVec = DAG.getBitcast(VT, Imm);
6787 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6788 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6789 DAG.getIntPtrConstant(0, dl));
6792 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6793 unsigned InsertIdx = NonConstIdx[i];
6794 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6795 Op.getOperand(InsertIdx),
6796 DAG.getIntPtrConstant(InsertIdx, dl));
6801 /// \brief Return true if \p N implements a horizontal binop, and write the
6802 /// operands for the horizontal binop into V0 and V1.
6804 /// This is a helper function of LowerToHorizontalOp().
6805 /// This function checks that the build_vector \p N in input implements a
6806 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6807 /// operation to match.
6808 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6809 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6810 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6811 /// arithmetic sub.
6812 ///
6813 /// This function only analyzes elements of \p N whose indices are
6814 /// in range [BaseIdx, LastIdx).
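///
/// As an illustrative sketch, for a v4f32 build_vector and Opcode == ISD::FADD:
///   (fadd (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (fadd (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
///   (fadd (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
///   (fadd (extract_vector_elt B, 2), (extract_vector_elt B, 3))
/// matches with V0 = A and V1 = B, which LowerToHorizontalOp then turns into
/// X86ISD::FHADD (haddps).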
6815 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6817 unsigned BaseIdx, unsigned LastIdx,
6818 SDValue &V0, SDValue &V1) {
6819 EVT VT = N->getValueType(0);
6821 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6822 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6823 "Invalid Vector in input!");
6825 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6826 bool CanFold = true;
6827 unsigned ExpectedVExtractIdx = BaseIdx;
6828 unsigned NumElts = LastIdx - BaseIdx;
6829 V0 = DAG.getUNDEF(VT);
6830 V1 = DAG.getUNDEF(VT);
6832 // Check if N implements a horizontal binop.
6833 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6834 SDValue Op = N->getOperand(i + BaseIdx);
6837 if (Op->isUndef()) {
6838 // Update the expected vector extract index.
6839 if (i * 2 == NumElts)
6840 ExpectedVExtractIdx = BaseIdx;
6841 ExpectedVExtractIdx += 2;
6845 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6850 SDValue Op0 = Op.getOperand(0);
6851 SDValue Op1 = Op.getOperand(1);
6853 // Try to match the following pattern:
6854 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6855 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6856 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6857 Op0.getOperand(0) == Op1.getOperand(0) &&
6858 isa<ConstantSDNode>(Op0.getOperand(1)) &&
6859 isa<ConstantSDNode>(Op1.getOperand(1)));
6863 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6864 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6866 if (i * 2 < NumElts) {
6868 V0 = Op0.getOperand(0);
6869 if (V0.getValueType() != VT)
6874 V1 = Op0.getOperand(0);
6875 if (V1.getValueType() != VT)
6878 if (i * 2 == NumElts)
6879 ExpectedVExtractIdx = BaseIdx;
6882 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6883 if (I0 == ExpectedVExtractIdx)
6884 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6885 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6886 // Try to match the following dag sequence:
6887 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6888 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6892 ExpectedVExtractIdx += 2;
6898 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6899 /// a concat_vector.
6901 /// This is a helper function of LowerToHorizontalOp().
6902 /// This function expects two 256-bit vectors called V0 and V1.
6903 /// At first, each vector is split into two separate 128-bit vectors.
6904 /// Then, the resulting 128-bit vectors are used to implement two
6905 /// horizontal binary operations.
6907 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6909 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
6910 /// the two new horizontal binops.
6911 /// When Mode is set, the first horizontal binop dag node would take as input
6912 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6913 /// horizontal binop dag node would take as input the lower 128-bit of V1
6914 /// and the upper 128-bit of V1.
6916 /// HADD V0_LO, V0_HI
6917 /// HADD V1_LO, V1_HI
6919 /// Otherwise, the first horizontal binop dag node takes as input the lower
6920 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6921 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
6923 /// HADD V0_LO, V1_LO
6924 /// HADD V0_HI, V1_HI
6926 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6927 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6928 /// the upper 128-bits of the result.
6929 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6930 const SDLoc &DL, SelectionDAG &DAG,
6931 unsigned X86Opcode, bool Mode,
6932 bool isUndefLO, bool isUndefHI) {
6933 MVT VT = V0.getSimpleValueType();
6934 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
6935 "Invalid nodes in input!");
6937 unsigned NumElts = VT.getVectorNumElements();
6938 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
6939 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
6940 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
6941 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
6942 MVT NewVT = V0_LO.getSimpleValueType();
6944 SDValue LO = DAG.getUNDEF(NewVT);
6945 SDValue HI = DAG.getUNDEF(NewVT);
6947 if (Mode) {
6948 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6949 if (!isUndefLO && !V0->isUndef())
6950 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6951 if (!isUndefHI && !V1->isUndef())
6952 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6953 } else {
6954 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6955 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
6956 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6958 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
6959 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6960 }
6962 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6965 /// Returns true iff \p BV builds a vector with the result equivalent to
6966 /// the result of ADDSUB operation.
6967 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
6968 /// are written to the parameters \p Opnd0 and \p Opnd1.
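///
/// For example (a sketch of the matched pattern), a v4f32 build_vector of
///   (fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0)),
///   (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1)),
///   (fsub (extract_vector_elt A, 2), (extract_vector_elt B, 2)),
///   (fadd (extract_vector_elt A, 3), (extract_vector_elt B, 3))
/// is recognized with Opnd0 = A and Opnd1 = B, i.e. the ADDSUBPS pattern.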
6969 static bool isAddSub(const BuildVectorSDNode *BV,
6970 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6971 SDValue &Opnd0, SDValue &Opnd1) {
6973 MVT VT = BV->getSimpleValueType(0);
6974 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
6975 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
6976 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
6979 unsigned NumElts = VT.getVectorNumElements();
6980 SDValue InVec0 = DAG.getUNDEF(VT);
6981 SDValue InVec1 = DAG.getUNDEF(VT);
6983 // Odd-numbered elements in the input build vector are obtained from
6984 // adding two integer/float elements.
6985 // Even-numbered elements in the input build vector are obtained from
6986 // subtracting two integer/float elements.
6987 unsigned ExpectedOpcode = ISD::FSUB;
6988 unsigned NextExpectedOpcode = ISD::FADD;
6989 bool AddFound = false;
6990 bool SubFound = false;
6992 for (unsigned i = 0, e = NumElts; i != e; ++i) {
6993 SDValue Op = BV->getOperand(i);
6995 // Skip 'undef' values.
6996 unsigned Opcode = Op.getOpcode();
6997 if (Opcode == ISD::UNDEF) {
6998 std::swap(ExpectedOpcode, NextExpectedOpcode);
7002 // Early exit if we found an unexpected opcode.
7003 if (Opcode != ExpectedOpcode)
7006 SDValue Op0 = Op.getOperand(0);
7007 SDValue Op1 = Op.getOperand(1);
7009 // Try to match the following pattern:
7010 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7011 // Early exit if we cannot match that sequence.
7012 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7013 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7014 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7015 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7016 Op0.getOperand(1) != Op1.getOperand(1))
7019 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7023 // We found a valid add/sub node. Update the information accordingly.
7029 // Update InVec0 and InVec1.
7030 if (InVec0.isUndef()) {
7031 InVec0 = Op0.getOperand(0);
7032 if (InVec0.getSimpleValueType() != VT)
7035 if (InVec1.isUndef()) {
7036 InVec1 = Op1.getOperand(0);
7037 if (InVec1.getSimpleValueType() != VT)
7041 // Make sure that operands in input to each add/sub node always
7042 // come from a same pair of vectors.
7043 if (InVec0 != Op0.getOperand(0)) {
7044 if (ExpectedOpcode == ISD::FSUB)
7047 // FADD is commutable. Try to commute the operands
7048 // and then test again.
7049 std::swap(Op0, Op1);
7050 if (InVec0 != Op0.getOperand(0))
7054 if (InVec1 != Op1.getOperand(0))
7057 // Update the pair of expected opcodes.
7058 std::swap(ExpectedOpcode, NextExpectedOpcode);
7061 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7062 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7063 return false;
7065 Opnd0 = InVec0;
7066 Opnd1 = InVec1;
7067 return true;
7068 }
7070 /// Returns true if it is possible to fold MUL and an idiom that has already been
7071 /// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7072 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7073 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7075 /// Prior to calling this function it should be known that there is some
7076 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7077 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7078 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7079 /// of \p Opnd0 uses is expected to be equal to 2.
7080 /// For example, this function may be called for the following IR:
7081 /// %AB = fmul fast <2 x double> %A, %B
7082 /// %Sub = fsub fast <2 x double> %AB, %C
7083 /// %Add = fadd fast <2 x double> %AB, %C
7084 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7085 /// <2 x i32> <i32 0, i32 3>
7086 /// There is a def for %Addsub here, which potentially can be replaced by
7087 /// X86ISD::ADDSUB operation:
7088 /// %Addsub = X86ISD::ADDSUB %AB, %C
7089 /// and such ADDSUB can further be replaced with FMADDSUB:
7090 /// %Addsub = FMADDSUB %A, %B, %C.
7092 /// The main reason why this method is called before the replacement of the
7093 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7094 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7095 /// FMADDSUB is.
7096 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7097 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7098 if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7099 !Subtarget.hasAnyFMA())
7102 // FIXME: These checks must match the similar ones in
7103 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7104 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7105 // or MUL + ADDSUB to FMADDSUB.
7106 const TargetOptions &Options = DAG.getTarget().Options;
7107 bool AllowFusion =
7108 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7109 if (!AllowFusion)
7110 return false;
7112 Opnd2 = Opnd1;
7113 Opnd1 = Opnd0.getOperand(1);
7114 Opnd0 = Opnd0.getOperand(0);
7116 return true;
7117 }
7119 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7120 /// into an X86ISD::ADDSUB or X86ISD::FMADDSUB node accordingly.
7121 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7122 const X86Subtarget &Subtarget,
7123 SelectionDAG &DAG) {
7124 SDValue Opnd0, Opnd1;
7125 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7128 MVT VT = BV->getSimpleValueType(0);
7131 // Try to generate X86ISD::FMADDSUB node here.
7133 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7134 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7136 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7137 // the ADDSUB idiom has been successfully recognized. There are no known
7138 // X86 targets with 512-bit ADDSUB instructions!
7139 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7140 // recognition.
7141 if (VT.is512BitVector())
7142 return SDValue();
7144 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7147 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7148 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7149 const X86Subtarget &Subtarget,
7150 SelectionDAG &DAG) {
7151 MVT VT = BV->getSimpleValueType(0);
7152 unsigned NumElts = VT.getVectorNumElements();
7153 unsigned NumUndefsLO = 0;
7154 unsigned NumUndefsHI = 0;
7155 unsigned Half = NumElts/2;
7157 // Count the number of UNDEF operands in the build_vector in input.
7158 for (unsigned i = 0, e = Half; i != e; ++i)
7159 if (BV->getOperand(i)->isUndef())
7162 for (unsigned i = Half, e = NumElts; i != e; ++i)
7163 if (BV->getOperand(i)->isUndef())
7166 // Early exit if this is either a build_vector of all UNDEFs or all the
7167 // operands but one are UNDEF.
7168 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7172 SDValue InVec0, InVec1;
7173 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7174 // Try to match an SSE3 float HADD/HSUB.
7175 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7176 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7178 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7179 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7180 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7181 // Try to match an SSSE3 integer HADD/HSUB.
7182 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7183 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7185 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7186 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7189 if (!Subtarget.hasAVX())
7190 return SDValue();
7192 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7193 // Try to match an AVX horizontal add/sub of packed single/double
7194 // precision floating point values from 256-bit vectors.
7195 SDValue InVec2, InVec3;
7196 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7197 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7198 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7199 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7200 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7202 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7203 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7204 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7205 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7206 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7207 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7208 // Try to match an AVX2 horizontal add/sub of signed integers.
7209 SDValue InVec2, InVec3;
7211 bool CanFold = true;
7213 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7214 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7215 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7216 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7217 X86Opcode = X86ISD::HADD;
7218 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7219 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7220 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7221 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7222 X86Opcode = X86ISD::HSUB;
7227 // Fold this build_vector into a single horizontal add/sub.
7228 // Do this only if the target has AVX2.
7229 if (Subtarget.hasAVX2())
7230 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7232 // Do not try to expand this build_vector into a pair of horizontal
7233 // add/sub if we can emit a pair of scalar add/sub.
7234 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7237 // Convert this build_vector into a pair of horizontal binops followed by
7238 // a concat vector.
7239 bool isUndefLO = NumUndefsLO == Half;
7240 bool isUndefHI = NumUndefsHI == Half;
7241 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7242 isUndefLO, isUndefHI);
7246 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7247 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7249 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7250 X86Opcode = X86ISD::HADD;
7251 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7252 X86Opcode = X86ISD::HSUB;
7253 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7254 X86Opcode = X86ISD::FHADD;
7255 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7256 X86Opcode = X86ISD::FHSUB;
7260 // Don't try to expand this build_vector into a pair of horizontal add/sub
7261 // if we can simply emit a pair of scalar add/sub.
7262 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7265 // Convert this build_vector into two horizontal add/sub ops followed by
7266 // a concat vector.
7267 bool isUndefLO = NumUndefsLO == Half;
7268 bool isUndefHI = NumUndefsHI == Half;
7269 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7270 isUndefLO, isUndefHI);
7276 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7277 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7278 /// just apply the bit to the vectors.
7279 /// NOTE: It's not in our interest to start making a general purpose vectorizer
7280 /// from this, but enough scalar bit operations are created by the later
7281 /// legalization + scalarization stages to need basic support.
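///
/// A sketch of the kind of input this handles: a v4i32 build_vector of
///   (and %a, 1), (and %b, 2), (and %c, 4), (and %d, 8)
/// becomes (and (build_vector %a, %b, %c, %d), (build_vector 1, 2, 4, 8)),
/// letting the two new build_vectors lower through the usual paths.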
7282 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7283 SelectionDAG &DAG) {
7285 MVT VT = Op->getSimpleValueType(0);
7286 unsigned NumElems = VT.getVectorNumElements();
7287 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7289 // Check that all elements have the same opcode.
7290 // TODO: Should we allow UNDEFS and if so how many?
7291 unsigned Opcode = Op->getOperand(0).getOpcode();
7292 for (unsigned i = 1; i < NumElems; ++i)
7293 if (Opcode != Op->getOperand(i).getOpcode())
7296 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7303 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7304 return SDValue();
7308 SmallVector<SDValue, 4> LHSElts, RHSElts;
7309 for (SDValue Elt : Op->ops()) {
7310 SDValue LHS = Elt.getOperand(0);
7311 SDValue RHS = Elt.getOperand(1);
7313 // We expect the canonicalized RHS operand to be the constant.
7314 if (!isa<ConstantSDNode>(RHS))
7315 return SDValue();
7316 LHSElts.push_back(LHS);
7317 RHSElts.push_back(RHS);
7320 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7321 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7322 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7325 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7326 /// functionality to do this, so it's all zeros, all ones, or some derivation
7327 /// that is cheap to calculate.
7328 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7329 const X86Subtarget &Subtarget) {
7331 MVT VT = Op.getSimpleValueType();
7333 // Vectors containing all zeros can be matched by pxor and xorps.
7334 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7335 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7336 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7337 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7340 return getZeroVector(VT, Subtarget, DAG, DL);
7343 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7344 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7345 // vpcmpeqd on 256-bit vectors.
7346 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7347 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7348 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7351 return getOnesVector(VT, Subtarget, DAG, DL);
7358 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7361 MVT VT = Op.getSimpleValueType();
7362 MVT ExtVT = VT.getVectorElementType();
7363 unsigned NumElems = Op.getNumOperands();
7365 // Generate vectors for predicate vectors.
7366 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7367 return LowerBUILD_VECTORvXi1(Op, DAG);
7369 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7370 return VectorConstant;
7372 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7373 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7375 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7376 return HorizontalOp;
7377 if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG))
7379 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7382 unsigned EVTBits = ExtVT.getSizeInBits();
7384 unsigned NumZero = 0;
7385 unsigned NumNonZero = 0;
7386 uint64_t NonZeros = 0;
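// NonZeros is a bitmask of the non-zero elements; e.g. for a v4i32
// build_vector <0, x, 0, y> the loop below leaves NonZeros == 0b1010, and
// countTrailingZeros(NonZeros) == 1 later identifies element 1 as the first
// non-zero element.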
7387 bool IsAllConstants = true;
7388 SmallSet<SDValue, 8> Values;
7389 for (unsigned i = 0; i < NumElems; ++i) {
7390 SDValue Elt = Op.getOperand(i);
7394 if (Elt.getOpcode() != ISD::Constant &&
7395 Elt.getOpcode() != ISD::ConstantFP)
7396 IsAllConstants = false;
7397 if (X86::isZeroNode(Elt))
7400 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7401 NonZeros |= ((uint64_t)1 << i);
7406 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7407 if (NumNonZero == 0)
7408 return DAG.getUNDEF(VT);
7410 // Special case for single non-zero, non-undef, element.
7411 if (NumNonZero == 1) {
7412 unsigned Idx = countTrailingZeros(NonZeros);
7413 SDValue Item = Op.getOperand(Idx);
7415 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7416 // the value are obviously zero, truncate the value to i32 and do the
7417 // insertion that way. Only do this if the value is non-constant or if the
7418 // value is a constant being inserted into element 0. It is cheaper to do
7419 // a constant pool load than it is to do a movd + shuffle.
7420 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7421 (!IsAllConstants || Idx == 0)) {
7422 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
7424 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7425 MVT VecVT = MVT::v4i32;
7427 // Truncate the value (which may itself be a constant) to i32, and
7428 // convert it to a vector with movd (S2V+shuffle to zero extend).
7429 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7430 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7431 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7432 Item, Idx * 2, true, Subtarget, DAG));
7436 // If we have a constant or non-constant insertion into the low element of
7437 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7438 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7439 // depending on what the source datatype is.
7442 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7444 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7445 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7446 assert((VT.is128BitVector() || VT.is256BitVector() ||
7447 VT.is512BitVector()) &&
7448 "Expected an SSE value type!");
7449 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7450 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7451 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7454 // We can't directly insert an i8 or i16 into a vector, so zero extend
7456 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7457 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7458 if (VT.getSizeInBits() >= 256) {
7459 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7460 if (Subtarget.hasAVX()) {
7461 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7462 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7464 // Without AVX, we need to extend to a 128-bit vector and then
7465 // insert into the 256-bit vector.
7466 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7467 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7468 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7471 assert(VT.is128BitVector() && "Expected an SSE value type!");
7472 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7473 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7475 return DAG.getBitcast(VT, Item);
7479 // Is it a vector logical left shift?
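// For example (an illustrative sketch): building v2i64 <0, x> is the same as
// taking (scalar_to_vector x), which holds x in element 0, and shifting the
// whole vector left by 64 bits (8 bytes, i.e. pslldq $8), which moves x into
// element 1 and fills element 0 with zeros.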
7480 if (NumElems == 2 && Idx == 1 &&
7481 X86::isZeroNode(Op.getOperand(0)) &&
7482 !X86::isZeroNode(Op.getOperand(1))) {
7483 unsigned NumBits = VT.getSizeInBits();
7484 return getVShift(true, VT,
7485 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7486 VT, Op.getOperand(1)),
7487 NumBits/2, DAG, *this, dl);
7490 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7491 return SDValue();
7493 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7494 // is a non-constant being inserted into an element other than the low one,
7495 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
7496 // movd/movss) to move this into the low element, then shuffle it into
7498 if (EVTBits == 32) {
7499 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7500 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7504 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7505 if (Values.size() == 1) {
7506 if (EVTBits == 32) {
7507 // Instead of a shuffle like this:
7508 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7509 // Check if it's possible to issue this instead.
7510 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7511 unsigned Idx = countTrailingZeros(NonZeros);
7512 SDValue Item = Op.getOperand(Idx);
7513 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7514 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7519 // A vector full of immediates; various special cases are already
7520 // handled, so this is best done with a single constant-pool load.
7521 if (IsAllConstants)
7522 return SDValue();
7524 // See if we can use a vector load to get all of the elements.
7525 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7526 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7527 if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
7531 // For AVX-length vectors, build the individual 128-bit pieces and use
7532 // shuffles to put them in place.
7533 if (VT.is256BitVector() || VT.is512BitVector()) {
7534 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7536 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7538 // Build both the lower and upper subvector.
7540 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7541 SDValue Upper = DAG.getBuildVector(
7542 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7544 // Recreate the wider vector with the lower and upper part.
7545 if (VT.is256BitVector())
7546 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7547 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7550 // Let legalizer expand 2-wide build_vectors.
7551 if (EVTBits == 64) {
7552 if (NumNonZero == 1) {
7553 // One half is zero or undef.
7554 unsigned Idx = countTrailingZeros(NonZeros);
7555 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7556 Op.getOperand(Idx));
7557 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7562 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7563 if (EVTBits == 8 && NumElems == 16)
7564 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7565 DAG, Subtarget, *this))
7568 if (EVTBits == 16 && NumElems == 8)
7569 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7570 DAG, Subtarget, *this))
7573 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7574 if (EVTBits == 32 && NumElems == 4)
7575 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
7578 // If element VT is == 32 bits, turn it into a number of shuffles.
7579 if (NumElems == 4 && NumZero > 0) {
7580 SmallVector<SDValue, 8> Ops(NumElems);
7581 for (unsigned i = 0; i < 4; ++i) {
7582 bool isZero = !(NonZeros & (1ULL << i));
7584 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7586 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7589 for (unsigned i = 0; i < 2; ++i) {
7590 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7593 Ops[i] = Ops[i*2]; // Must be a zero vector.
7596 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7599 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7602 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7607 bool Reverse1 = (NonZeros & 0x3) == 2;
7608 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7612 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7613 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7615 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7618 if (Values.size() > 1 && VT.is128BitVector()) {
7619 // Check for a build vector from mostly shuffle plus few inserting.
7620 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
7623 // For SSE 4.1, use insertps to put the high elements into the low element.
7624 if (Subtarget.hasSSE41()) {
7626 if (!Op.getOperand(0).isUndef())
7627 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7629 Result = DAG.getUNDEF(VT);
7631 for (unsigned i = 1; i < NumElems; ++i) {
7632 if (Op.getOperand(i).isUndef()) continue;
7633 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7634 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
7639 // Otherwise, expand into a number of unpckl*, start by extending each of
7640 // our (non-undef) elements to the full vector width with the element in the
7641 // bottom slot of the vector (which generates no code for SSE).
7642 SmallVector<SDValue, 8> Ops(NumElems);
7643 for (unsigned i = 0; i < NumElems; ++i) {
7644 if (!Op.getOperand(i).isUndef())
7645 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7647 Ops[i] = DAG.getUNDEF(VT);
7650 // Next, we iteratively mix elements, e.g. for v4f32:
7651 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7652 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7653 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
7654 unsigned EltStride = NumElems >> 1;
7655 while (EltStride != 0) {
7656 for (unsigned i = 0; i < EltStride; ++i) {
7657 // If Ops[i+EltStride] is undef and this is the first round of mixing,
7658 // then it is safe to just drop this shuffle: V[i] is already in the
7659 // right place, the one element (since it's the first round) being
7660 // inserted as undef can be dropped. This isn't safe for successive
7661 // rounds because they will permute elements within both vectors.
7662 if (Ops[i+EltStride].isUndef() &&
7663 EltStride == NumElems/2)
7666 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
7675 // 256-bit AVX can use the vinsertf128 instruction
7676 // to create 256-bit vectors from two other 128-bit ones.
7677 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7679 MVT ResVT = Op.getSimpleValueType();
7681 assert((ResVT.is256BitVector() ||
7682 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7684 SDValue V1 = Op.getOperand(0);
7685 SDValue V2 = Op.getOperand(1);
7686 unsigned NumElems = ResVT.getVectorNumElements();
7687 if (ResVT.is256BitVector())
7688 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7690 if (Op.getNumOperands() == 4) {
7691 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7692 ResVT.getVectorNumElements()/2);
7693 SDValue V3 = Op.getOperand(2);
7694 SDValue V4 = Op.getOperand(3);
7695 return concat256BitVectors(
7696 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
7697 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
7700 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7703 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
7704 const X86Subtarget &Subtarget,
7705 SelectionDAG & DAG) {
7707 MVT ResVT = Op.getSimpleValueType();
7708 unsigned NumOfOperands = Op.getNumOperands();
7710 assert(isPowerOf2_32(NumOfOperands) &&
7711 "Unexpected number of operands in CONCAT_VECTORS");
7713 SDValue Undef = DAG.getUNDEF(ResVT);
7714 if (NumOfOperands > 2) {
7715 // Specialize the cases when all, or all but one, of the operands are undef.
7716 unsigned NumOfDefinedOps = 0;
7718 for (unsigned i = 0; i < NumOfOperands; i++)
7719 if (!Op.getOperand(i).isUndef()) {
7723 if (NumOfDefinedOps == 0)
7725 if (NumOfDefinedOps == 1) {
7726 unsigned SubVecNumElts =
7727 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
7728 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
7729 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
7730 Op.getOperand(OpIdx), IdxVal);
7733 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7734 ResVT.getVectorNumElements()/2);
7735 SmallVector<SDValue, 2> Ops;
7736 for (unsigned i = 0; i < NumOfOperands/2; i++)
7737 Ops.push_back(Op.getOperand(i));
7738 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7740 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
7741 Ops.push_back(Op.getOperand(i));
7742 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7743 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
7747 SDValue V1 = Op.getOperand(0);
7748 SDValue V2 = Op.getOperand(1);
7749 unsigned NumElems = ResVT.getVectorNumElements();
7750 assert(V1.getValueType() == V2.getValueType() &&
7751 V1.getValueType().getVectorNumElements() == NumElems/2 &&
7752 "Unexpected operands in CONCAT_VECTORS");
7754 if (ResVT.getSizeInBits() >= 16)
7755 return Op; // The operation is legal with KUNPCK
7757 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
7758 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
7759 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
7760 if (IsZeroV1 && IsZeroV2)
7763 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
7765 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7767 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
7769 SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
7771 V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
7774 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
7776 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7777 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
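// For illustration: concatenating two v8i1 masks into a v16i1 result falls
// into the "legal with KUNPCK" case above and can be selected as a single
// KUNPCKBW; only narrower results (from v2i1/v4i1 sources) need the
// INSERT_SUBVECTOR expansion shown here.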
7780 static SDValue LowerCONCAT_VECTORS(SDValue Op,
7781 const X86Subtarget &Subtarget,
7782 SelectionDAG &DAG) {
7783 MVT VT = Op.getSimpleValueType();
7784 if (VT.getVectorElementType() == MVT::i1)
7785 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
7787 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7788 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7789 Op.getNumOperands() == 4)));
7791 // AVX can use the vinsertf128 instruction to create 256-bit vectors
7792 // from two other 128-bit ones.
7794 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7795 return LowerAVXCONCAT_VECTORS(Op, DAG);
7798 //===----------------------------------------------------------------------===//
7799 // Vector shuffle lowering
7801 // This is an experimental code path for lowering vector shuffles on x86. It is
7802 // designed to handle arbitrary vector shuffles and blends, gracefully
7803 // degrading performance as necessary. It works hard to recognize idiomatic
7804 // shuffles and lower them to optimal instruction patterns without leaving
7805 // a framework that allows reasonably efficient handling of all vector shuffle
7807 //===----------------------------------------------------------------------===//
7809 /// \brief Tiny helper function to identify a no-op mask.
7811 /// This is a somewhat boring predicate function. It checks whether the mask
7812 /// array input, which is assumed to be a single-input shuffle mask of the kind
7813 /// used by the X86 shuffle instructions (not a fully general
7814 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7815 /// in-place shuffle are 'no-op's.
7816 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7817 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7818 assert(Mask[i] >= -1 && "Out of bound mask element!");
7819 if (Mask[i] >= 0 && Mask[i] != i)
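// For example, the 4-element masks {0, 1, 2, 3} and {-1, 1, -1, 3} are both
// no-ops: every defined element already sits in its own slot.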
7825 /// \brief Test whether there are elements crossing 128-bit lanes in this
7828 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7829 /// and we routinely test for these.
7830 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7831 int LaneSize = 128 / VT.getScalarSizeInBits();
7832 int Size = Mask.size();
7833 for (int i = 0; i < Size; ++i)
7834 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7839 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
7841 /// This checks a shuffle mask to see if it is performing the same
7842 /// lane-relative shuffle in each sub-lane. This trivially implies
7843 /// that it is also not lane-crossing. It may however involve a blend from the
7844 /// same lane of a second vector.
7846 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7847 /// non-trivial to compute in the face of undef lanes. The representation is
7848 /// suitable for use with existing 128-bit shuffles as entries from the second
7849 /// vector have been remapped to [LaneSize, 2*LaneSize).
7850 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
7852 SmallVectorImpl<int> &RepeatedMask) {
7853 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
7854 RepeatedMask.assign(LaneSize, -1);
7855 int Size = Mask.size();
7856 for (int i = 0; i < Size; ++i) {
7857 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
7860 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7861 // This entry crosses lanes, so there is no way to model this shuffle.
7864 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7865 // Adjust second vector indices to start at LaneSize instead of Size.
7866 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
7867 : Mask[i] % LaneSize + LaneSize;
7868 if (RepeatedMask[i % LaneSize] < 0)
7869 // This is the first non-undef entry in this slot of a 128-bit lane.
7870 RepeatedMask[i % LaneSize] = LocalM;
7871 else if (RepeatedMask[i % LaneSize] != LocalM)
7872 // Found a mismatch with the repeated mask.
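// As a worked example (hypothetical mask): for a v8i32 shuffle mask
// {0, 9, 2, 11, 4, 13, 6, 15} the same pattern repeats in both 128-bit lanes,
// and the returned RepeatedMask is {0, 5, 2, 7}, with second-vector entries
// remapped into [LaneSize, 2*LaneSize).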
7878 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
7880 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7881 SmallVectorImpl<int> &RepeatedMask) {
7882 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
7885 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
7887 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7888 SmallVectorImpl<int> &RepeatedMask) {
7889 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
7892 /// Test whether a target shuffle mask is equivalent within each sub-lane.
7893 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
7894 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
7896 SmallVectorImpl<int> &RepeatedMask) {
7897 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
7898 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
7899 int Size = Mask.size();
7900 for (int i = 0; i < Size; ++i) {
7901 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
7902 if (Mask[i] == SM_SentinelUndef)
7904 if (Mask[i] == SM_SentinelZero) {
7905 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
7907 RepeatedMask[i % LaneSize] = SM_SentinelZero;
7910 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7911 // This entry crosses lanes, so there is no way to model this shuffle.
7914 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7915 // Adjust second vector indices to start at LaneSize instead of Size.
7917 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
7918 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
7919 // This is the first non-undef entry in this slot of a 128-bit lane.
7920 RepeatedMask[i % LaneSize] = LocalM;
7921 else if (RepeatedMask[i % LaneSize] != LocalM)
7922 // Found a mismatch with the repeated mask.
7928 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7931 /// This is a fast way to test a shuffle mask against a fixed pattern:
7933 /// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
7935 /// It returns true if the mask is exactly as wide as the argument list, and
7936 /// each element of the mask is either -1 (signifying undef) or the value given
7937 /// in the argument.
7938 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
7939 ArrayRef<int> ExpectedMask) {
7940 if (Mask.size() != ExpectedMask.size())
7943 int Size = Mask.size();
7945 // If the values are build vectors, we can look through them to find
7946 // equivalent inputs that make the shuffles equivalent.
7947 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
7948 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
7950 for (int i = 0; i < Size; ++i) {
7951 assert(Mask[i] >= -1 && "Out of bound mask element!");
7952 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
7953 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
7954 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
7955 if (!MaskBV || !ExpectedBV ||
7956 MaskBV->getOperand(Mask[i] % Size) !=
7957 ExpectedBV->getOperand(ExpectedMask[i] % Size))
7965 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
7967 /// The masks must be exactly the same width.
7969 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
7970 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
7972 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
7973 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
7974 ArrayRef<int> ExpectedMask) {
7975 int Size = Mask.size();
7976 if (Size != (int)ExpectedMask.size())
7979 for (int i = 0; i < Size; ++i)
7980 if (Mask[i] == SM_SentinelUndef)
7982 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
7984 else if (Mask[i] != ExpectedMask[i])
7990 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7992 /// This helper function produces an 8-bit shuffle immediate corresponding to
7993 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7994 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7997 /// NB: We rely heavily on "undef" masks preserving the input lane.
7998 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
7999 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8000 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8001 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8002 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8003 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8006 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8007 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8008 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8009 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8013 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
8014 SelectionDAG &DAG) {
8015 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
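// As an illustration with a hypothetical mask: {2, 1, 3, 0} encodes as
//   (2 << 0) | (1 << 2) | (3 << 4) | (0 << 6) = 0x36,
// the imm8 form consumed by PSHUFD/SHUFPS-style instructions, while undef
// lanes fall back to their own index so the input lane is preserved.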
8018 /// \brief Compute whether each element of a shuffle is zeroable.
8020 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8021 /// Either it is an undef element in the shuffle mask, the element of the input
8022 /// referenced is undef, or the element of the input referenced is known to be
8023 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8024 /// as many lanes with this technique as possible to simplify the remaining
8026 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
8027 SDValue V1, SDValue V2) {
8028 SmallBitVector Zeroable(Mask.size(), false);
8029 V1 = peekThroughBitcasts(V1);
8030 V2 = peekThroughBitcasts(V2);
8032 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8033 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8035 int VectorSizeInBits = V1.getValueSizeInBits();
8036 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8037 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8039 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8041 // Handle the easy cases.
8042 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8047 // Determine shuffle input and normalize the mask.
8048 SDValue V = M < Size ? V1 : V2;
8051 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8052 if (V.getOpcode() != ISD::BUILD_VECTOR)
8055 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
8056 // portion of the (larger) source element must be UNDEF/ZERO.
8057 if ((Size % V.getNumOperands()) == 0) {
8058 int Scale = Size / V->getNumOperands();
8059 SDValue Op = V.getOperand(M / Scale);
8060 if (Op.isUndef() || X86::isZeroNode(Op))
8062 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8063 APInt Val = Cst->getAPIntValue();
8064 Val = Val.lshr((M % Scale) * ScalarSizeInBits);
8065 Val = Val.getLoBits(ScalarSizeInBits);
8066 Zeroable[i] = (Val == 0);
8067 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8068 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8069 Val = Val.lshr((M % Scale) * ScalarSizeInBits);
8070 Val = Val.getLoBits(ScalarSizeInBits);
8071 Zeroable[i] = (Val == 0);
8076 // If the BUILD_VECTOR has more elements than the mask, then all of the
8077 // (smaller) source elements covered by the mask element must be UNDEF or ZERO.
8078 if ((V.getNumOperands() % Size) == 0) {
8079 int Scale = V->getNumOperands() / Size;
8080 bool AllZeroable = true;
8081 for (int j = 0; j < Scale; ++j) {
8082 SDValue Op = V.getOperand((M * Scale) + j);
8083 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8085 Zeroable[i] = AllZeroable;
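// For example, with a hypothetical v4i32 mask {0, 5, 2, 7} where V2 is an
// all-zeros build vector, elements 1 and 3 are zeroable and the returned
// bit vector is {0, 1, 0, 1}.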
8093 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8094 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8095 ArrayRef<int> Mask, SDValue V1,
8097 const SmallBitVector &Zeroable,
8098 const X86Subtarget &Subtarget,
8099 SelectionDAG &DAG) {
8100 int Size = Mask.size();
8101 int LaneSize = 128 / VT.getScalarSizeInBits();
8102 const int NumBytes = VT.getSizeInBits() / 8;
8103 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8105 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8106 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8107 (Subtarget.hasBWI() && VT.is512BitVector()));
8109 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8110 // Sign bit set in i8 mask means zero element.
8111 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8114 for (int i = 0; i < NumBytes; ++i) {
8115 int M = Mask[i / NumEltBytes];
8117 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8120 if (Zeroable[i / NumEltBytes]) {
8121 PSHUFBMask[i] = ZeroMask;
8125 // We can only use a single input of V1 or V2.
8126 SDValue SrcV = (M >= Size ? V2 : V1);
8132 // PSHUFB can't cross lanes, ensure this doesn't happen.
8133 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8137 M = M * NumEltBytes + (i % NumEltBytes);
8138 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8140 assert(V && "Failed to find a source input");
8142 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8143 return DAG.getBitcast(
8144 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8145 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
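// As an illustration with a hypothetical single-input v4i32 mask {1, 0, Z, 3}
// (Z = zeroable): each dword expands to four byte indices, giving the PSHUFB
// byte mask {4,5,6,7, 0,1,2,3, 0x80,0x80,0x80,0x80, 12,13,14,15}, where 0x80
// (sign bit set) forces the corresponding result byte to zero.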
8148 // X86 has dedicated unpack instructions that can handle specific blend
8149 // operations: UNPCKH and UNPCKL.
8150 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8151 ArrayRef<int> Mask, SDValue V1,
8152 SDValue V2, SelectionDAG &DAG) {
8153 SmallVector<int, 8> Unpckl;
8154 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8155 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8156 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8158 SmallVector<int, 8> Unpckh;
8159 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8160 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8161 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8163 // Commute and try again.
8164 ShuffleVectorSDNode::commuteMask(Unpckl);
8165 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8166 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8168 ShuffleVectorSDNode::commuteMask(Unpckh);
8169 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8170 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
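// For reference, the binary unpack masks this matches for v4i32/v4f32 are
// {0, 4, 1, 5} (UNPCKL, interleaving the low halves) and {2, 6, 3, 7}
// (UNPCKH, interleaving the high halves).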
8175 /// \brief Try to emit a bitmask instruction for a shuffle.
8177 /// This handles cases where we can model a blend exactly as a bitmask due to
8178 /// one of the inputs being zeroable.
8179 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8180 SDValue V2, ArrayRef<int> Mask,
8181 const SmallBitVector &Zeroable,
8182 SelectionDAG &DAG) {
8183 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8184 MVT EltVT = VT.getVectorElementType();
8185 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8187 DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
8188 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
8190 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8193 if (Mask[i] % Size != i)
8194 return SDValue(); // Not a blend.
8196 V = Mask[i] < Size ? V1 : V2;
8197 else if (V != (Mask[i] < Size ? V1 : V2))
8198 return SDValue(); // Can only let one input through the mask.
8200 VMaskOps[i] = AllOnes;
8203 return SDValue(); // No non-zeroable elements!
8205 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8206 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
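// For example, a hypothetical v4i32 mask {0, 5, 2, 7} where V2 is known zero
// is lowered as V1 & <-1, 0, -1, 0>: the zeroable elements select the zero
// constant and the remaining elements pass V1 through unchanged.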
8209 /// \brief Try to emit a blend instruction for a shuffle using bit math.
8211 /// This is used as a fallback approach when first class blend instructions are
8212 /// unavailable. Currently it is only suitable for integer vectors, but could
8213 /// be generalized for floating point vectors if desirable.
8214 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8215 SDValue V2, ArrayRef<int> Mask,
8216 SelectionDAG &DAG) {
8217 assert(VT.isInteger() && "Only supports integer vector types!");
8218 MVT EltVT = VT.getVectorElementType();
8219 int NumEltBits = EltVT.getSizeInBits();
8220 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8221 SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
8223 SmallVector<SDValue, 16> MaskOps;
8224 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8225 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8226 return SDValue(); // Shuffled input!
8227 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8230 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8231 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8232 // We have to cast V2 around.
8233 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8234 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8235 DAG.getBitcast(MaskVT, V1Mask),
8236 DAG.getBitcast(MaskVT, V2)));
8237 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
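// In effect this computes (V1 & M) | (V2 & ~M) with PAND/PANDN/POR. For a
// hypothetical v8i16 mask {0, 9, 2, 11, 4, 13, 6, 15}, M is all-ones in
// elements 0, 2, 4 and 6 (the V1 lanes) and zero elsewhere.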
8240 /// \brief Try to emit a blend instruction for a shuffle.
8242 /// This doesn't do any checks for the availability of instructions for blending
8243 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8244 /// be matched in the backend with the type given. What it does check for is
8245 /// that the shuffle mask is a blend, or convertible into a blend with zero.
8246 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8247 SDValue V2, ArrayRef<int> Original,
8248 const SmallBitVector &Zeroable,
8249 const X86Subtarget &Subtarget,
8250 SelectionDAG &DAG) {
8251 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8252 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8253 SmallVector<int, 8> Mask(Original.begin(), Original.end());
8254 bool ForceV1Zero = false, ForceV2Zero = false;
8256 // Attempt to generate the binary blend mask. If an input is zero then
8257 // we can use any lane.
8258 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8259 unsigned BlendMask = 0;
8260 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8266 if (M == i + Size) {
8267 BlendMask |= 1u << i;
8278 BlendMask |= 1u << i;
8283 return SDValue(); // Shuffled input!
8286 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
8288 V1 = getZeroVector(VT, Subtarget, DAG, DL);
8290 V2 = getZeroVector(VT, Subtarget, DAG, DL);
8292 auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
8293 unsigned ScaledMask = 0;
8294 for (int i = 0; i != Size; ++i)
8295 if (BlendMask & (1u << i))
8296 for (int j = 0; j != Scale; ++j)
8297 ScaledMask |= 1u << (i * Scale + j);
8301 switch (VT.SimpleTy) {
8306 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8307 DAG.getConstant(BlendMask, DL, MVT::i8));
8311 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8315 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8316 // that instruction.
8317 if (Subtarget.hasAVX2()) {
8318 // Scale the blend by the number of 32-bit dwords per element.
8319 int Scale = VT.getScalarSizeInBits() / 32;
8320 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
8321 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8322 V1 = DAG.getBitcast(BlendVT, V1);
8323 V2 = DAG.getBitcast(BlendVT, V2);
8324 return DAG.getBitcast(
8325 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8326 DAG.getConstant(BlendMask, DL, MVT::i8)));
8330 // For integer shuffles we need to expand the mask and cast the inputs to
8331 // v8i16s prior to blending.
8332 int Scale = 8 / VT.getVectorNumElements();
8333 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
8334 V1 = DAG.getBitcast(MVT::v8i16, V1);
8335 V2 = DAG.getBitcast(MVT::v8i16, V2);
8336 return DAG.getBitcast(VT,
8337 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8338 DAG.getConstant(BlendMask, DL, MVT::i8)));
8342 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8343 SmallVector<int, 8> RepeatedMask;
8344 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8345 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8346 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
8348 for (int i = 0; i < 8; ++i)
8349 if (RepeatedMask[i] >= 8)
8350 BlendMask |= 1u << i;
8351 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8352 DAG.getConstant(BlendMask, DL, MVT::i8));
8358 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8359 "256-bit byte-blends require AVX2 support!");
8361 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8362 if (SDValue Masked =
8363 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8366 // Scale the blend by the number of bytes per element.
8367 int Scale = VT.getScalarSizeInBits() / 8;
8369 // This form of blend is always done on bytes. Compute the byte vector
8371 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8373 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8374 // mix of LLVM's code generator and the x86 backend. We tell the code
8375 // generator that boolean values in the elements of an x86 vector register
8376 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8377 // mapping a select to operand #1, and 'false' mapping to operand #2. The
8378 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8379 // of the element (the remaining are ignored) and 0 in that high bit would
8380 // mean operand #1 while 1 in the high bit would mean operand #2. So while
8381 // the LLVM model for boolean values in vector elements gets the relevant
8382 // bit set, it is set backwards and over constrained relative to x86's
8384 SmallVector<SDValue, 32> VSELECTMask;
8385 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8386 for (int j = 0; j < Scale; ++j)
8387 VSELECTMask.push_back(
8388 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8389 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8392 V1 = DAG.getBitcast(BlendVT, V1);
8393 V2 = DAG.getBitcast(BlendVT, V2);
8394 return DAG.getBitcast(
8395 VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
8396 DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
8400 llvm_unreachable("Not a supported integer vector type!");
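// As a worked example (hypothetical masks): a v8i16 shuffle mask
// {0, 9, 2, 11, 4, 13, 6, 15} takes the odd elements from V2, giving a
// PBLENDW immediate of 0b10101010 = 0xAA; a v2i64 mask {0, 3} gives
// BlendMask = 0b10, which ScaleBlendMask widens to 0b1100 for VPBLENDD.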
8404 /// \brief Try to lower as a blend of elements from two inputs followed by
8405 /// a single-input permutation.
8407 /// This matches the pattern where we can blend elements from two inputs and
8408 /// then reduce the shuffle to a single-input permutation.
8409 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8410 SDValue V1, SDValue V2,
8412 SelectionDAG &DAG) {
8413 // We build up the blend mask while checking whether a blend is a viable way
8414 // to reduce the shuffle.
8415 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8416 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
8418 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8422 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
8424 if (BlendMask[Mask[i] % Size] < 0)
8425 BlendMask[Mask[i] % Size] = Mask[i];
8426 else if (BlendMask[Mask[i] % Size] != Mask[i])
8427 return SDValue(); // Can't blend in the needed input!
8429 PermuteMask[i] = Mask[i] % Size;
8432 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8433 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
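// For instance, a hypothetical v4i32 mask {1, 4, 3, 6} decomposes into the
// blend mask {4, 1, 6, 3} (each element taken in its home slot) followed by
// the single-input permute {1, 0, 3, 2}.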
8436 /// \brief Generic routine to decompose a shuffle and blend into independent
8437 /// blends and permutes.
8439 /// This matches the extremely common pattern for handling combined
8440 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
8441 /// operations. It will try to pick the best arrangement of shuffles and
8443 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
8447 SelectionDAG &DAG) {
8448 // Shuffle the input elements into the desired positions in V1 and V2 and
8449 // blend them together.
8450 SmallVector<int, 32> V1Mask(Mask.size(), -1);
8451 SmallVector<int, 32> V2Mask(Mask.size(), -1);
8452 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8453 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8454 if (Mask[i] >= 0 && Mask[i] < Size) {
8455 V1Mask[i] = Mask[i];
8457 } else if (Mask[i] >= Size) {
8458 V2Mask[i] = Mask[i] - Size;
8459 BlendMask[i] = i + Size;
8462 // Try to lower with the simpler initial blend strategy unless one of the
8463 // input shuffles would be a no-op. We prefer to shuffle inputs as the
8464 // shuffle may be able to fold with a load or other benefit. However, when
8465 // we'll have to do 2x as many shuffles in order to achieve this, blending
8466 // first is a better strategy.
8467 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
8468 if (SDValue BlendPerm =
8469 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
8472 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8473 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8474 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8477 /// \brief Try to lower a vector shuffle as a rotation.
8479 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
8480 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
8481 ArrayRef<int> Mask) {
8482 int NumElts = Mask.size();
8484 // We need to detect various ways of spelling a rotation:
8485 // [11, 12, 13, 14, 15, 0, 1, 2]
8486 // [-1, 12, 13, 14, -1, -1, 1, -1]
8487 // [-1, -1, -1, -1, -1, -1, 1, 2]
8488 // [ 3, 4, 5, 6, 7, 8, 9, 10]
8489 // [-1, 4, 5, 6, -1, -1, 9, -1]
8490 // [-1, 4, 5, 6, -1, -1, -1, -1]
8493 for (int i = 0; i < NumElts; ++i) {
8495 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
8496 "Unexpected mask index.");
8500 // Determine where a rotated vector would have started.
8501 int StartIdx = i - (M % NumElts);
8503 // The identity rotation isn't interesting, stop.
8506 // If we found the tail of a vector the rotation must be the missing
8507 // front. If we found the head of a vector, it must be how much of the
8509 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
8512 Rotation = CandidateRotation;
8513 else if (Rotation != CandidateRotation)
8514 // The rotations don't match, so we can't match this mask.
8517 // Compute which value this mask is pointing at.
8518 SDValue MaskV = M < NumElts ? V1 : V2;
8520 // Compute which of the two target values this index should be assigned
8521 // to. This reflects whether the high elements are remaining or the low
8522 // elements are remaining.
8523 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
8525 // Either set up this value if we've not encountered it before, or check
8526 // that it remains consistent.
8529 else if (TargetV != MaskV)
8530 // This may be a rotation, but it pulls from the inputs in some
8531 // unsupported interleaving.
8535 // Check that we successfully analyzed the mask, and normalize the results.
8536 assert(Rotation != 0 && "Failed to locate a viable rotation!");
8537 assert((Lo || Hi) && "Failed to find a rotated input vector!");
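// Taking the first example mask above, [11, 12, 13, 14, 15, 0, 1, 2] for
// eight elements: the V2 entries give StartIdx = -3 and the V1 entries give
// StartIdx = 5, and both imply the same rotation of 3 elements, which is
// what this routine returns.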
8549 /// \brief Try to lower a vector shuffle as a byte rotation.
8551 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
8552 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
8553 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
8554 /// try to generically lower a vector shuffle through such a pattern. It
8555 /// does not check for the profitability of lowering either as PALIGNR or
8556 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
8557 /// This matches shuffle vectors that look like:
8559 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
8561 /// Essentially it concatenates V1 and V2, shifts right by some number of
8562 /// elements, and takes the low elements as the result. Note that while this is
8563 /// specified as a *right shift* because x86 is little-endian, it is a *left
8564 /// rotate* of the vector lanes.
8565 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
8566 ArrayRef<int> Mask) {
8567 // Don't accept any shuffles with zero elements.
8568 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
8571 // PALIGNR works on 128-bit lanes.
8572 SmallVector<int, 16> RepeatedMask;
8573 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
8576 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
8580 // PALIGNR rotates bytes, so we need to scale the
8581 // rotation based on how many bytes are in the vector lane.
8582 int NumElts = RepeatedMask.size();
8583 int Scale = 16 / NumElts;
8584 return Rotation * Scale;
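// Continuing the example above for v8i16: a 3-element rotation scales by
// 16/8 = 2 bytes per element, so the PALIGNR byte rotation returned is 6.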
8587 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
8588 SDValue V1, SDValue V2,
8590 const X86Subtarget &Subtarget,
8591 SelectionDAG &DAG) {
8592 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
8594 SDValue Lo = V1, Hi = V2;
8595 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
8596 if (ByteRotation <= 0)
8599 // Cast the inputs to i8 vector of correct length to match PALIGNR or
8601 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8602 Lo = DAG.getBitcast(ByteVT, Lo);
8603 Hi = DAG.getBitcast(ByteVT, Hi);
8605 // SSSE3 targets can use the palignr instruction.
8606 if (Subtarget.hasSSSE3()) {
8607 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
8608 "512-bit PALIGNR requires BWI instructions");
8609 return DAG.getBitcast(
8610 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
8611 DAG.getConstant(ByteRotation, DL, MVT::i8)));
8614 assert(VT.is128BitVector() &&
8615 "Rotate-based lowering only supports 128-bit lowering!");
8616 assert(Mask.size() <= 16 &&
8617 "Can shuffle at most 16 bytes in a 128-bit vector!");
8618 assert(ByteVT == MVT::v16i8 &&
8619 "SSE2 rotate lowering only needed for v16i8!");
8621 // Default SSE2 implementation
8622 int LoByteShift = 16 - ByteRotation;
8623 int HiByteShift = ByteRotation;
8625 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
8626 DAG.getConstant(LoByteShift, DL, MVT::i8));
8627 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
8628 DAG.getConstant(HiByteShift, DL, MVT::i8));
8629 return DAG.getBitcast(VT,
8630 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
8633 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
8635 /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
8636 /// rotation of the concatenation of two vectors; This routine will
8637 /// try to generically lower a vector shuffle through such a pattern.
8639 /// Essentially it concatenates V1 and V2, shifts right by some number of
8640 /// elements, and takes the low elements as the result. Note that while this is
8641 /// specified as a *right shift* because x86 is little-endian, it is a *left
8642 /// rotate* of the vector lanes.
8643 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
8644 SDValue V1, SDValue V2,
8646 const X86Subtarget &Subtarget,
8647 SelectionDAG &DAG) {
8648 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
8649 "Only 32-bit and 64-bit elements are supported!");
8651 // 128/256-bit vectors are only supported with VLX.
8652 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
8653 && "VLX required for 128/256-bit vectors");
8655 SDValue Lo = V1, Hi = V2;
8656 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
8660 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
8661 DAG.getConstant(Rotation, DL, MVT::i8));
8664 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
8666 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
8667 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
8668 /// matches elements from one of the input vectors shuffled to the left or
8669 /// right with zeroable elements 'shifted in'. It handles both the strictly
8670 /// bit-wise element shifts and the byte shift across an entire 128-bit double
8673 /// PSHL : (little-endian) left bit shift.
8674 /// [ zz, 0, zz, 2 ]
8675 /// [ -1, 4, zz, -1 ]
8676 /// PSRL : (little-endian) right bit shift.
8678 /// [ -1, -1, 7, zz]
8679 /// PSLLDQ : (little-endian) left byte shift
8680 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
8681 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
8682 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
8683 /// PSRLDQ : (little-endian) right byte shift
8684 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
8685 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
8686 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
8687 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
8688 unsigned ScalarSizeInBits,
8689 ArrayRef<int> Mask, int MaskOffset,
8690 const SmallBitVector &Zeroable,
8691 const X86Subtarget &Subtarget) {
8692 int Size = Mask.size();
8693 unsigned SizeInBits = Size * ScalarSizeInBits;
8695 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
8696 for (int i = 0; i < Size; i += Scale)
8697 for (int j = 0; j < Shift; ++j)
8698 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
8704 auto MatchShift = [&](int Shift, int Scale, bool Left) {
8705 for (int i = 0; i != Size; i += Scale) {
8706 unsigned Pos = Left ? i + Shift : i;
8707 unsigned Low = Left ? i : i + Shift;
8708 unsigned Len = Scale - Shift;
8709 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
8713 int ShiftEltBits = ScalarSizeInBits * Scale;
8714 bool ByteShift = ShiftEltBits > 64;
8715 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
8716 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
8717 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
8719 // Normalize the scale for byte shifts to still produce an i64 element
8721 Scale = ByteShift ? Scale / 2 : Scale;
8723 // We need to round trip through the appropriate type for the shift.
8724 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
8725 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
8726 : MVT::getVectorVT(ShiftSVT, Size / Scale);
8727 return (int)ShiftAmt;
8730 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
8731 // keep doubling the size of the integer elements up to that. We can
8732 // then shift the elements of the integer vector by whole multiples of
8733 // their width within the elements of the larger integer vector. Test each
8734 // multiple to see if we can find a match with the moved element indices
8735 // and that the shifted in elements are all zeroable.
8736 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
8737 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
8738 for (int Shift = 1; Shift != Scale; ++Shift)
8739 for (bool Left : {true, false})
8740 if (CheckZeros(Shift, Scale, Left)) {
8741 int ShiftAmt = MatchShift(Shift, Scale, Left);
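// As a worked example (hypothetical v4i32 mask): [Z, 0, Z, 2], with Z meaning
// zeroable, matches Scale == 2, Shift == 1, Left == true, i.e. a VSHLI by
// 32 bits on the input reinterpreted as v2i64 (PSLLQ $32).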
8750 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
8751 SDValue V2, ArrayRef<int> Mask,
8752 const SmallBitVector &Zeroable,
8753 const X86Subtarget &Subtarget,
8754 SelectionDAG &DAG) {
8755 int Size = Mask.size();
8756 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
8762 // Try to match shuffle against V1 shift.
8763 int ShiftAmt = matchVectorShuffleAsShift(
8764 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
8766 // If V1 failed, try to match shuffle against V2 shift.
8769 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
8770 Mask, Size, Zeroable, Subtarget);
8777 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
8778 "Illegal integer vector type");
8779 V = DAG.getBitcast(ShiftVT, V);
8780 V = DAG.getNode(Opcode, DL, ShiftVT, V,
8781 DAG.getConstant(ShiftAmt, DL, MVT::i8));
8782 return DAG.getBitcast(VT, V);
8785 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
8786 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
8787 SDValue V2, ArrayRef<int> Mask,
8788 const SmallBitVector &Zeroable,
8789 SelectionDAG &DAG) {
8790 int Size = Mask.size();
8791 int HalfSize = Size / 2;
8792 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
8793 assert(!Zeroable.all() && "Fully zeroable shuffle mask");
8795 // Upper half must be undefined.
8796 if (!isUndefInRange(Mask, HalfSize, HalfSize))
8799 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
8800 // Remainder of lower half result is zero and upper half is all undef.
8801 auto LowerAsEXTRQ = [&]() {
8802 // Determine the extraction length from the part of the
8803 // lower half that isn't zeroable.
8805 for (; Len > 0; --Len)
8806 if (!Zeroable[Len - 1])
8808 assert(Len > 0 && "Zeroable shuffle mask");
8810 // Attempt to match first Len sequential elements from the lower half.
8813 for (int i = 0; i != Len; ++i) {
8817 SDValue &V = (M < Size ? V1 : V2);
8820 // The extracted elements must start at a valid index and all mask
8821 // elements must be in the lower half.
8822 if (i > M || M >= HalfSize)
8825 if (Idx < 0 || (Src == V && Idx == (M - i))) {
8836 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
8837 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8838 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8839 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
8840 DAG.getConstant(BitLen, DL, MVT::i8),
8841 DAG.getConstant(BitIdx, DL, MVT::i8));
8844 if (SDValue ExtrQ = LowerAsEXTRQ())
8847 // INSERTQ: Extract lowest Len elements from lower half of second source and
8848 // insert over first source, starting at Idx.
8849 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
8850 auto LowerAsInsertQ = [&]() {
8851 for (int Idx = 0; Idx != HalfSize; ++Idx) {
8854 // Attempt to match first source from mask before insertion point.
8855 if (isUndefInRange(Mask, 0, Idx)) {
8857 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
8859 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
8865 // Extend the extraction length looking to match both the insertion of
8866 // the second source and the remaining elements of the first.
8867 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
8872 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
8874 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
8880 // Match the remaining elements of the lower half.
8881 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
8883 } else if ((!Base || (Base == V1)) &&
8884 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
8886 } else if ((!Base || (Base == V2)) &&
8887 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
8894 // We may not have a base (first source) - this can safely be undefined.
8896 Base = DAG.getUNDEF(VT);
8898 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8899 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8900 return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
8901 DAG.getConstant(BitLen, DL, MVT::i8),
8902 DAG.getConstant(BitIdx, DL, MVT::i8));
8909 if (SDValue InsertQ = LowerAsInsertQ())
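// For reference, BitLen/BitIdx above are expressed in bits (modulo 64): e.g.
// extracting Len = 2 v8i16 elements starting at Idx = 1 yields the EXTRQI
// immediates BitLen = 32 and BitIdx = 16.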
8915 /// \brief Lower a vector shuffle as a zero or any extension.
8917 /// Given a specific number of elements, element bit width, and extension
8918 /// stride, produce either a zero or any extension based on the available
8919 /// features of the subtarget. The extended elements are consecutive and
8920 /// can start from an offset element index in the input; to
8921 /// avoid excess shuffling, the offset must either be in the bottom lane
8922 /// or at the start of a higher lane. All extended elements must be from
8924 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8925 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
8926 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8927 assert(Scale > 1 && "Need a scale to extend.");
8928 int EltBits = VT.getScalarSizeInBits();
8929 int NumElements = VT.getVectorNumElements();
8930 int NumEltsPerLane = 128 / EltBits;
8931 int OffsetLane = Offset / NumEltsPerLane;
8932 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
8933 "Only 8, 16, and 32 bit elements can be extended.");
8934 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
8935 assert(0 <= Offset && "Extension offset must not be negative.");
8936 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
8937 "Extension offset must be in the first lane or start an upper lane.");
8939 // Check that an index is in same lane as the base offset.
8940 auto SafeOffset = [&](int Idx) {
8941 return OffsetLane == (Idx / NumEltsPerLane);
8944 // Shift along an input so that the offset base moves to the first element.
8945 auto ShuffleOffset = [&](SDValue V) {
8949 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8950 for (int i = 0; i * Scale < NumElements; ++i) {
8951 int SrcIdx = i + Offset;
8952 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
8954 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
8957 // Found a valid zext mask! Try various lowering strategies based on the
8958 // input type and available ISA extensions.
8959 if (Subtarget.hasSSE41()) {
8960 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
8961 // PUNPCK will catch this in a later shuffle match.
8962 if (Offset && Scale == 2 && VT.is128BitVector())
8964 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
8965 NumElements / Scale);
8966 InputV = ShuffleOffset(InputV);
8968 // For 256-bit vectors, we only need the lower (128-bit) input half.
8969 // For 512-bit vectors, we only need the lower input half or quarter.
8970 if (VT.getSizeInBits() > 128)
8971 InputV = extractSubVector(InputV, 0, DAG, DL,
8972 std::max(128, (int)VT.getSizeInBits() / Scale));
8974 InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
8975 return DAG.getBitcast(VT, InputV);
8978 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
8980 // For any extends we can cheat for larger element sizes and use shuffle
8981 // instructions that can fold with a load and/or copy.
8982 if (AnyExt && EltBits == 32) {
8983 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
8985 return DAG.getBitcast(
8986 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8987 DAG.getBitcast(MVT::v4i32, InputV),
8988 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
8990 if (AnyExt && EltBits == 16 && Scale > 2) {
8991 int PSHUFDMask[4] = {Offset / 2, -1,
8992 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
8993 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8994 DAG.getBitcast(MVT::v4i32, InputV),
8995 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
8996 int PSHUFWMask[4] = {1, -1, -1, -1};
8997 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
8998 return DAG.getBitcast(
8999 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9000 DAG.getBitcast(MVT::v8i16, InputV),
9001 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9004 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
9006 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9007 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9008 assert(VT.is128BitVector() && "Unexpected vector width!");
9010 int LoIdx = Offset * EltBits;
9011 SDValue Lo = DAG.getBitcast(
9012 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9013 DAG.getConstant(EltBits, DL, MVT::i8),
9014 DAG.getConstant(LoIdx, DL, MVT::i8)));
9016 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9017 !SafeOffset(Offset + 1))
9018 return DAG.getBitcast(VT, Lo);
9020 int HiIdx = (Offset + 1) * EltBits;
9021 SDValue Hi = DAG.getBitcast(
9022 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9023 DAG.getConstant(EltBits, DL, MVT::i8),
9024 DAG.getConstant(HiIdx, DL, MVT::i8)));
9025 return DAG.getBitcast(VT,
9026 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9029 // If this would require more than 2 unpack instructions to expand, use
9030 // pshufb when available. We can only use more than 2 unpack instructions
9031 // when zero extending i8 elements which also makes it easier to use pshufb.
9032 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9033 assert(NumElements == 16 && "Unexpected byte vector width!");
9034 SDValue PSHUFBMask[16];
9035 for (int i = 0; i < 16; ++i) {
9036 int Idx = Offset + (i / Scale);
9037 PSHUFBMask[i] = DAG.getConstant(
9038 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9040 InputV = DAG.getBitcast(MVT::v16i8, InputV);
9041 return DAG.getBitcast(
9042 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9043 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
9046 // If we are extending from an offset, ensure we start on a boundary that
9047 // we can unpack from.
9048 int AlignToUnpack = Offset % (NumElements / Scale);
9049 if (AlignToUnpack) {
9050 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9051 for (int i = AlignToUnpack; i < NumElements; ++i)
9052 ShMask[i - AlignToUnpack] = i;
9053 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
9054 Offset -= AlignToUnpack;
9057 // Otherwise emit a sequence of unpacks.
9059 unsigned UnpackLoHi = X86ISD::UNPCKL;
9060 if (Offset >= (NumElements / 2)) {
9061 UnpackLoHi = X86ISD::UNPCKH;
9062 Offset -= (NumElements / 2);
9065 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
9066 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
9067 : getZeroVector(InputVT, Subtarget, DAG, DL);
9068 InputV = DAG.getBitcast(InputVT, InputV);
9069 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
9073 } while (Scale > 1);
9074 return DAG.getBitcast(VT, InputV);
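// For example, a hypothetical v16i8 shuffle whose mask is
// {0, Z, 1, Z, ..., 7, Z} (Z = zeroable) is a Scale == 2 zero extension of
// the low eight bytes: with SSE4.1 it becomes a single PMOVZXBW-style VZEXT,
// and otherwise the unpack loop above emits PUNPCKLBW against a zero vector.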
9077 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
9079 /// This routine will try to do everything in its power to cleverly lower
9080 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
9081 /// check for the profitability of this lowering; it tries to aggressively
9082 /// match this pattern. It will use all of the micro-architectural details it
9083 /// can to emit an efficient lowering. It handles both blends with all-zero
9084 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9085 /// masking out later).
9087 /// The reason we have dedicated lowering for zext-style shuffles is that they
9088 /// are both incredibly common and often quite performance sensitive.
9089 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9090 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9091 const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
9092 SelectionDAG &DAG) {
9093 int Bits = VT.getSizeInBits();
9094 int NumLanes = Bits / 128;
9095 int NumElements = VT.getVectorNumElements();
9096 int NumEltsPerLane = NumElements / NumLanes;
9097 assert(VT.getScalarSizeInBits() <= 32 &&
9098 "Exceeds 32-bit integer zero extension limit");
9099 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9101 // Define a helper function to check a particular ext-scale and lower to it if
9103 auto Lower = [&](int Scale) -> SDValue {
9108 for (int i = 0; i < NumElements; ++i) {
9111 continue; // Valid anywhere but doesn't tell us anything.
9112 if (i % Scale != 0) {
9113 // Each of the extended elements need to be zeroable.
9117 // We no longer are in the anyext case.
9122 // Each of the base elements needs to be consecutive indices into the
9123 // same input vector.
9124 SDValue V = M < NumElements ? V1 : V2;
9125 M = M % NumElements;
9128 Offset = M - (i / Scale);
9129 } else if (InputV != V)
9130 return SDValue(); // Flip-flopping inputs.
9132 // Offset must start in the lowest 128-bit lane or at the start of an
9134 // FIXME: Is it ever worth allowing a negative base offset?
9135 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
9136 (Offset % NumEltsPerLane) == 0))
9139 // If we are offsetting, all referenced entries must come from the same
9141 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
9144 if ((M % NumElements) != (Offset + (i / Scale)))
9145 return SDValue(); // Non-consecutive strided elements.
9149 // If we fail to find an input, we have a zero-shuffle which should always
9150 // have already been handled.
9151 // FIXME: Maybe handle this here in case during blending we end up with one?
9155 // If we are offsetting, don't extend if we only match a single input; we
9156 // can always do better by using a basic PSHUF or PUNPCK.
9157 if (Offset != 0 && Matches < 2)
9160 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9161 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
9164 // The widest scale possible for extending is to a 64-bit integer.
9165 assert(Bits % 64 == 0 &&
9166 "The number of bits in a vector must be divisible by 64 on x86!");
9167 int NumExtElements = Bits / 64;
9169 // Each iteration, try extending the elements half as much, but into twice as
9171 for (; NumExtElements < NumElements; NumExtElements *= 2) {
9172 assert(NumElements % NumExtElements == 0 &&
9173 "The input vector size must be divisible by the extended size.");
9174 if (SDValue V = Lower(NumElements / NumExtElements))
9178 // General extends failed, but 128-bit vectors may be able to use MOVQ.
9182 // Returns one of the source operands if the shuffle can be reduced to a
9183 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
9184 auto CanZExtLowHalf = [&]() {
9185 for (int i = NumElements / 2; i != NumElements; ++i)
9188 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
9190 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
9195 if (SDValue V = CanZExtLowHalf()) {
9196 V = DAG.getBitcast(MVT::v2i64, V);
9197 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
9198 return DAG.getBitcast(VT, V);
9201 // No viable ext lowering found.
9205 /// \brief Try to get a scalar value for a specific element of a vector.
9207 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
9208 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
9209 SelectionDAG &DAG) {
9210 MVT VT = V.getSimpleValueType();
9211 MVT EltVT = VT.getVectorElementType();
9212 V = peekThroughBitcasts(V);
9214 // If the bitcasts shift the element size, we can't extract an equivalent
9216 MVT NewVT = V.getSimpleValueType();
9217 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
9220 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9221 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
9222 // Ensure the scalar operand is the same size as the destination.
9223 // FIXME: Add support for scalar truncation where possible.
9224 SDValue S = V.getOperand(Idx);
9225 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
9226 return DAG.getBitcast(EltVT, S);
9232 /// \brief Helper to test for a load that can be folded with x86 shuffles.
9234 /// This is particularly important because the set of instructions varies
9235 /// significantly based on whether the operand is a load or not.
9236 static bool isShuffleFoldableLoad(SDValue V) {
9237 V = peekThroughBitcasts(V);
9238 return ISD::isNON_EXTLoad(V.getNode());
9241 /// \brief Try to lower insertion of a single element into a zero vector.
9243 /// This is a common pattern that we have especially efficient patterns to lower
9244 /// across all subtarget feature sets.
9245 static SDValue lowerVectorShuffleAsElementInsertion(
9246 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9247 const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
9248 SelectionDAG &DAG) {
9250 MVT EltVT = VT.getVectorElementType();
9253 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
9255 bool IsV1Zeroable = true;
9256 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9257 if (i != V2Index && !Zeroable[i]) {
9258 IsV1Zeroable = false;
9262 // Check for a single input from a SCALAR_TO_VECTOR node.
9263 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
9264 // all the smarts here sunk into that routine. However, the current
9265 // lowering of BUILD_VECTOR makes that nearly impossible until the old
9266 // vector shuffle lowering is dead.
9267 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
9269 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
9270 // We need to zext the scalar if it is smaller than an i32.
9271 V2S = DAG.getBitcast(EltVT, V2S);
9272 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
9273 // Using zext to expand a narrow element won't work for non-zero
9278 // Zero-extend directly to i32.
9280 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
9282 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
9283 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
9284 EltVT == MVT::i16) {
9285 // Either not inserting from the low element of the input or the input
9286 // element size is too small to use VZEXT_MOVL to clear the high bits.
9290 if (!IsV1Zeroable) {
9291 // If V1 can't be treated as a zero vector we have fewer options to lower
9292 // this. We can't support integer vectors or non-zero targets cheaply, and
9293 // the V1 elements can't be permuted in any way.
9294 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9295 if (!VT.isFloatingPoint() || V2Index != 0)
9297 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
9298 V1Mask[V2Index] = -1;
9299 if (!isNoopShuffleMask(V1Mask))
9301 // This is essentially a special case blend operation, but if we have
9302 // general purpose blend operations, they are always faster. Bail and let
9303 // the rest of the lowering handle these as blends.
9304 if (Subtarget.hasSSE41())
9307 // Otherwise, use MOVSD or MOVSS.
9308 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9309 "Only two types of floating point element types to handle!");
9310 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
9314 // This lowering only works for the low element with floating point vectors.
9315 if (VT.isFloatingPoint() && V2Index != 0)
9318 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
9320 V2 = DAG.getBitcast(VT, V2);
9323 // If we have 4 or fewer lanes we can cheaply shuffle the element into
9324 // the desired position. Otherwise it is more efficient to do a vector
9325 // shift left. We know that we can do a vector shift left because all
9326 // the inputs are zero.
9327 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
9328 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
9329 V2Shuffle[V2Index] = 0;
9330 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
9332 V2 = DAG.getBitcast(MVT::v16i8, V2);
9334 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
9335 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
9336 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
9337 DAG.getDataLayout(), VT)));
9338 V2 = DAG.getBitcast(VT, V2);
9344 /// Try to lower broadcast of a single - truncated - integer element,
9345 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
9347 /// This assumes we have AVX2.
9348 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
9349 SDValue V0, int BroadcastIdx,
9350 const X86Subtarget &Subtarget,
9351 SelectionDAG &DAG) {
9352 assert(Subtarget.hasAVX2() &&
9353 "We can only lower integer broadcasts with AVX2!");
9355 EVT EltVT = VT.getVectorElementType();
9356 EVT V0VT = V0.getValueType();
9358 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
9359 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
9361 EVT V0EltVT = V0VT.getVectorElementType();
9362 if (!V0EltVT.isInteger())
9365 const unsigned EltSize = EltVT.getSizeInBits();
9366 const unsigned V0EltSize = V0EltVT.getSizeInBits();
9368 // This is only a truncation if the original element type is larger.
9369 if (V0EltSize <= EltSize)
9372 assert(((V0EltSize % EltSize) == 0) &&
9373 "Scalar type sizes must all be powers of 2 on x86!");
9375 const unsigned V0Opc = V0.getOpcode();
9376 const unsigned Scale = V0EltSize / EltSize;
9377 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();
9383 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
9385 // If we're extracting non-least-significant bits, shift so we can truncate.
9386 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
9387 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
9388 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
9389 if (const int OffsetIdx = BroadcastIdx % Scale)
9390 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
9391 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
9397 /// \brief Try to lower broadcast of a single element.
9399 /// For convenience, this code also bundles all of the subtarget feature set
9400 /// filtering. While a little annoying to re-dispatch on type here, there isn't
9401 /// a convenient way to factor it out.
9402 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             const X86Subtarget &Subtarget,
                                             SelectionDAG &DAG) {
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
        (Subtarget.hasAVX2() && VT.isInteger())))
    return SDValue();
9413 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
9414 // we can only broadcast from a register with AVX2.
9415 unsigned NumElts = Mask.size();
9416 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
9417 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
  // Check that the mask is a broadcast.
  int BroadcastIdx = -1;
  for (int i = 0; i != (int)NumElts; ++i) {
    SmallVector<int, 8> BroadcastMask(NumElts, i);
    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
      BroadcastIdx = i;
      break;
    }
  }

  if (BroadcastIdx < 0)
    return SDValue();
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");

  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      SDValue VSrc = V.getOperand(0);
      MVT SrcVT = VSrc.getSimpleValueType();
      if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
        break;
      V = VSrc;
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      int OperandSize = Mask.size() / V.getNumOperands();
      V = V.getOperand(BroadcastIdx / OperandSize);
      BroadcastIdx %= OperandSize;
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
      if (!ConstantIdx)
        break;

      int BeginIdx = (int)ConstantIdx->getZExtValue();
      int EndIdx =
          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
        BroadcastIdx -= BeginIdx;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // First, look through bitcast: if the original value has a larger element
  // type than the shuffle, the broadcast element is in essence truncated.
  // Make that explicit to ease folding.
9480 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
9481 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
9482 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
9483 return TruncBroadcast;
9485 MVT BroadcastVT = VT;
9487 // Peek through any bitcast (only useful for loads).
9488 SDValue BC = peekThroughBitcasts(V);
9490 // Also check the simpler case, where we can directly reuse the scalar.
9491 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9492 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
9499 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
      Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
    }
9505 // If we are broadcasting a load that is only used by the shuffle
9506 // then we can reduce the vector load to the broadcasted scalar load.
9507 LoadSDNode *Ld = cast<LoadSDNode>(BC);
9508 SDValue BaseAddr = Ld->getOperand(1);
9509 EVT SVT = BroadcastVT.getScalarType();
9510 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
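    // For example, broadcasting element 3 of a loaded v4f32 narrows the load
    // to a single f32 at BaseAddr + 12 and broadcasts that scalar instead.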
9511 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
9512 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
9513 DAG.getMachineFunction().getMachineMemOperand(
9514 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
9516 // Make sure the newly-created LOAD is in the same position as Ld in
9517 // terms of dependency. We create a TokenFactor for Ld and V,
9518 // and update uses of Ld's output chain to use the TokenFactor.
    if (Ld->hasAnyUseOfValue(1)) {
      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                     SDValue(Ld, 1), SDValue(V.getNode(), 1));
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
                             SDValue(V.getNode(), 1));
    }
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BroadcastIdx != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // 128-bit subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    unsigned EltSize = VT.getScalarSizeInBits();
    if (((BroadcastIdx * EltSize) % 128) != 0)
      return SDValue();
    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
                    DAG.getIntPtrConstant(BroadcastIdx, DL));
  }
9550 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
9551 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
9552 DAG.getBitcast(MVT::f64, V));
9554 // Bitcast back to the same scalar type as BroadcastVT.
9555 MVT SrcVT = V.getSimpleValueType();
9556 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
9557 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
9558 "Unexpected vector element size");
    if (SrcVT.isVector()) {
      unsigned NumSrcElts = SrcVT.getVectorNumElements();
      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
    } else {
      SrcVT = BroadcastVT.getScalarType();
    }
    V = DAG.getBitcast(SrcVT, V);
  }
9568 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9569 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
9570 V = DAG.getBitcast(MVT::f64, V);
    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
  }

  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
9578 // Check for whether we can use INSERTPS to perform the shuffle. We only use
9579 // INSERTPS when the V1 elements are already in the correct locations
9580 // because otherwise we can just always use two SHUFPS instructions which
9581 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
9582 // perform INSERTPS if a single V1 element is out of place and all V2
9583 // elements are zeroable.
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                         unsigned &InsertPSMask,
                                         const SmallBitVector &Zeroable,
                                         ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
9589 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
9590 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
9591 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // are updated to reflect the match.
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);
    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
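    // The INSERTPS immediate packs the source element index into bits [7:6],
    // the destination element index into bits [5:4], and the zero mask into
    // bits [3:0]; e.g. inserting element 2 of V2 into lane 1 while zeroing
    // lane 3 gives (2 << 6) | (1 << 4) | 0b1000 = 0x98.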
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };

  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}
9673 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
9674 SDValue V2, ArrayRef<int> Mask,
9675 const SmallBitVector &Zeroable,
9676 SelectionDAG &DAG) {
9677 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9678 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9680 // Attempt to match the insertps pattern.
9681 unsigned InsertPSMask;
  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();
9685 // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
9690 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
9691 /// UNPCK instruction.
9693 /// This specifically targets cases where we end up with alternating between
9694 /// the two inputs, and so can permute them into something that feeds a single
9695 /// UNPCK instruction. Note that this routine only targets integer vectors
9696 /// because for floating point vectors we have a generalized SHUFPS lowering
9697 /// strategy that handles everything that doesn't *exactly* match an unpack,
9698 /// making this clever lowering unnecessary.
static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
                                                    SDValue V1, SDValue V2,
                                                    ArrayRef<int> Mask,
                                                    SelectionDAG &DAG) {
9703 assert(!VT.isFloatingPoint() &&
9704 "This routine only supports integer vectors.");
9705 assert(VT.is128BitVector() &&
9706 "This routine only works on 128-bit vectors.");
9707 assert(!V2.isUndef() &&
9708 "This routine should only be used when blending two inputs.");
9709 assert(Mask.size() >= 2 && "Single element masks are invalid.");
  int Size = Mask.size();

  int NumLoInputs =
      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
  int NumHiInputs =
      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

  bool UnpackLo = NumLoInputs >= NumHiInputs;
9720 auto TryUnpack = [&](int ScalarSize, int Scale) {
9721 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
9722 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      // Each element of the unpack contains Scale elements from this mask.
      int UnpackIdx = i / Scale;

      // We only handle the case where V1 feeds the first slots of the unpack.
      // We rely on canonicalization to ensure this is the case.
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();

      // Setup the mask for this input. The indexing is tricky as we have to
      // handle the unpack stride.
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
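      // For example, with Size == 8 and Scale == 2 (i16 elements unpacked as
      // i32), output element 3 sits in unpack slot 1 and must be fed from V2,
      // so Mask[3] % Size is placed at V2 position 1 for an UNPCKL.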
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          Mask[i] % Size;
    }

    // If we will have to shuffle both inputs to use the unpack, check whether
9744 // we can just unpack first and shuffle the result. If so, skip this unpack.
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
        !isNoopShuffleMask(V2Mask))
      return SDValue();
9749 // Shuffle the inputs into place.
9750 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9751 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9753 // Cast the inputs to the type we will use to unpack them.
9754 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
9755 V1 = DAG.getBitcast(UnpackVT, V1);
9756 V2 = DAG.getBitcast(UnpackVT, V2);
9758 // Unpack the inputs and cast the result back to the desired type.
    return DAG.getBitcast(
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };

  // We try each unpack from the largest to the smallest to try and find one
9765 // that fits this mask.
9766 int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;

  // If none of the unpack-rooted lowerings worked (or were profitable) try an
  // initial unpack.
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

    // FIXME: We could consider the total complexity of the permute of each
    // possible unpacking. Or at the least we should consider how many
    // half-crossings are created.
    // FIXME: We could consider commuting the unpacks.

    SmallVector<int, 32> PermMask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

      PermMask[i] =
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
    }

    return DAG.getVectorShuffle(
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
                            DL, VT, V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}
9802 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
9804 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
9805 /// support for floating point shuffles but not integer shuffles. These
9806 /// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
9809 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9810 const SmallBitVector &Zeroable,
9811 SDValue V1, SDValue V2,
9812 const X86Subtarget &Subtarget,
9813 SelectionDAG &DAG) {
9814 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
9815 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;
9824 // Straight shuffle of a single input vector. Simulate this by using the
9825 // single input as both of the "inputs" to this instruction..
9826 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
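    // Bit 0 of the SHUFPD immediate selects the element for result lane 0 and
    // bit 1 selects the element for result lane 1; e.g. the single-input mask
    // <1, 0> yields the immediate 0b01.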
9828 if (Subtarget.hasAVX()) {
9829 // If we have AVX, we can use VPERMILPS which will allow folding a load
9830 // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
    }

    return DAG.getNode(
        X86ISD::SHUFP, DL, MVT::v2f64,
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
  }
9841 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
9842 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
9844 // If we have a single input, insert that into V1 if we can do so cheaply.
9845 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;
9849 // Try inverting the insertion since for v2 masks it is easy to do and we
9850 // can't reliably sort the mask one way or the other.
9851 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
9852 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
      return Insertion;
  }
9858 // Try to use one of the special instruction patterns to handle two common
9859 // blend patterns if a zero-blend above didn't work.
9860 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
9861 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
9862 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
9863 // We can either use a special instruction to load over the low double or
9864 // to move just the low double.
      return DAG.getNode(
          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
          DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
9870 if (Subtarget.hasSSE41())
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
    return V;
9880 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
9885 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
9887 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
9888 /// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
9891 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9892 const SmallBitVector &Zeroable,
9893 SDValue V1, SDValue V2,
9894 const X86Subtarget &Subtarget,
9895 SelectionDAG &DAG) {
9896 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
9897 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;
9906 // Straight shuffle of a single input vector. For everything from SSE2
9907 // onward this has a single fast instruction with no scary immediates.
9908 // We have to map the mask as it is actually a v4i32 shuffle instruction.
9909 V1 = DAG.getBitcast(MVT::v4i32, V1);
9910 int WidenedMask[4] = {
9911 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
9912 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
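    // For example, the v2i64 mask <1, 0> widens to the v4i32 PSHUFD mask
    // <2, 3, 0, 1>, swapping the two 64-bit halves as pairs of 32-bit
    // elements.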
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }
9918 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
9919 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
9920 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
9921 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
9923 // If we have a blend of two same-type PACKUS operations and the blend aligns
9924 // with the low and high halves, we can just merge the PACKUS operations.
9925 // This is particularly important as it lets us merge shuffles that this
9926 // routine itself creates.
9927 auto GetPackNode = [](SDValue V) {
9928 V = peekThroughBitcasts(V);
    return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
  };
  if (SDValue V1Pack = GetPackNode(V1))
9932 if (SDValue V2Pack = GetPackNode(V2)) {
9933 EVT PackVT = V1Pack.getValueType();
9934 if (PackVT == V2Pack.getValueType())
9935 return DAG.getBitcast(MVT::v2i64,
9936 DAG.getNode(X86ISD::PACKUS, DL, PackVT,
9937 Mask[0] == 0 ? V1Pack.getOperand(0)
9938 : V1Pack.getOperand(1),
9939 Mask[1] == 2 ? V2Pack.getOperand(0)
                                                     : V2Pack.getOperand(1)));
    }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;
9948 // When loading a scalar and then shuffling it into a vector we can often do
9949 // the insertion cheaply.
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
9953 // Try inverting the insertion since for v2 masks it is easy to do and we
9954 // can't reliably sort the mask one way or the other.
9955 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;
9960 // We have different paths for blend lowering, but they all must use the
9961 // *exact* same predicate.
9962 bool IsBlendSupported = Subtarget.hasSSE41();
9963 if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;
9973 // Try to use byte rotation instructions.
9974 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
9975 if (Subtarget.hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Rotate;
9980 // If we have direct support for blends, we should lower by decomposing into
9981 // a permute. That will be faster than the domain cross.
9982 if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
                                                      Mask, DAG);
9986 // We implement this with SHUFPD which is pretty lame because it will likely
9987 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
9988 // However, all the alternatives are still more cycles and newer chips don't
9989 // have this problem. It would be really nice if x86 had better shuffles here.
9990 V1 = DAG.getBitcast(MVT::v2f64, V1);
9991 V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
9996 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
9998 /// This is used to disable more specialized lowerings when the shufps lowering
9999 /// will happen to be efficient.
10000 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10001 // This routine only handles 128-bit shufps.
10002 assert(Mask.size() == 4 && "Unsupported mask size!");
10003 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10004 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10005 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10006 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10008 // To lower with a single SHUFPS we need to have the low half and high half
10009 // each requiring a single input.
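  // For example, <0, 1, 6, 7> reads each half from a single input and is a
  // single SHUFPS, while <0, 5, 2, 7> mixes both inputs in each half and is
  // not.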
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}
10018 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10020 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10021 /// It makes no assumptions about whether this is the *best* lowering, it simply
10023 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10024 ArrayRef<int> Mask, SDValue V1,
10025 SDValue V2, SelectionDAG &DAG) {
10026 SDValue LowV = V1, HighV = V2;
10027 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10029 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10031 if (NumV2Elements == 1) {
10032 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

10038 if (Mask[V2AdjIndex] < 0) {
10039 // Handles all the cases where we have a single V2 element and an undef.
10040 // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
10047 // To make this work, blend them together as the first step.
10048 int V1Index = V2AdjIndex;
10049 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
10050 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
10051 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
10071 // We also handle the reversed case because this utility may get called
10072 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
      // arrange things in the right direction.
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      HighV = V1;
      LowV = V2;
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10080 // trying to place elements directly, just blend them and set up the final
10081 // shuffle to place them.
      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10086 Mask[2] < 4 ? Mask[2] : Mask[3],
10087 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10088 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10089 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10090 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // the blend.
      LowV = HighV = V1;
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
10096 NewMask[1] = Mask[0] < 4 ? 2 : 0;
10097 NewMask[2] = Mask[2] < 4 ? 1 : 3;
10098 NewMask[3] = Mask[2] < 4 ? 3 : 1;
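      // For example, for the mask <0, 5, 2, 7> the blend above builds
      // V1' = <V1[0], V1[2], V2[1], V2[3]>, and the final SHUFPS of V1' with
      // itself using NewMask <0, 2, 1, 3> produces
      // <V1[0], V2[1], V1[2], V2[3]>.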
  }

  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
10105 /// \brief Lower 4-lane 32-bit floating point shuffles.
10107 /// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
10110 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10111 const SmallBitVector &Zeroable,
10112 SDValue V1, SDValue V2,
10113 const X86Subtarget &Subtarget,
10114 SelectionDAG &DAG) {
10115 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10116 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10117 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10119 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10121 if (NumV2Elements == 0) {
10122 // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;
10127 // Use even/odd duplicate instructions for masks that match their pattern.
10128 if (Subtarget.hasSSE3()) {
10129 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10130 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10131 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    }

    if (Subtarget.hasAVX()) {
10136 // If we have AVX, we can use VPERMILPS which will allow folding a load
10137 // into the shuffle.
10138 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    }

    // Otherwise, use a straight shuffle of a single input vector. We pass the
10143 // input vector to both operands to simulate this with a SHUFPS.
10144 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // There are special ways we can lower some single-element blends. However, we
10149 // have custom ways we can lower more complex single-element blends below that
10150 // we defer to if both this and BLENDPS fail to match, so restrict this to
  // when the V2 input is targeting element 0 of the mask -- that is the fast
  // case here.
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;
10158 if (Subtarget.hasSSE41()) {
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

    // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V =
            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
      return V;
10168 if (!isSingleSHUFPSMask(Mask))
      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
              DL, MVT::v4f32, V1, V2, Mask, DAG))
        return BlendPerm;
  }

  // Use low/high mov instructions.
10175 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10176 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
10177 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10178 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
    return V;
10185 // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
10189 /// \brief Lower 4-lane i32 vector shuffles.
10191 /// We try to handle these with integer-domain shuffles where we can, but for
10192 /// blends we use the floating point domain blend instructions.
10193 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10194 const SmallBitVector &Zeroable,
10195 SDValue V1, SDValue V2,
10196 const X86Subtarget &Subtarget,
10197 SelectionDAG &DAG) {
10198 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10199 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10200 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10202 // Whenever we can lower this as a zext, that instruction is strictly faster
10203 // than any alternative. It also allows us to fold memory operands into the
10204 // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;
10209 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10211 if (NumV2Elements == 0) {
10212 // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;
10217 // Straight shuffle of a single input vector. For everything from SSE2
10218 // onward this has a single fast instruction with no scary immediates.
10219 // We coerce the shuffle pattern to be compatible with UNPCK instructions
10220 // but we aren't actually going to use the UNPCK instruction because doing
10221 // so prevents folding a load into this instruction or making a copy.
10222 const int UnpackLoMask[] = {0, 0, 1, 1};
10223 const int UnpackHiMask[] = {2, 2, 3, 3};
10224 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10225 Mask = UnpackLoMask;
10226 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10227 Mask = UnpackHiMask;
    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;
10238 // There are special ways we can lower some single-element blends.
10239 if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;
10244 // We have different paths for blend lowering, but they all must use the
10245 // *exact* same predicate.
10246 bool IsBlendSupported = Subtarget.hasSSE41();
10247 if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
    return V;
10261 // Try to use byte rotation instructions.
10262 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
10263 if (Subtarget.hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;
10268 // Assume that a single SHUFPS is faster than an alternative sequence of
10269 // multiple instructions (even if the CPU has a domain penalty).
10270 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10271 if (!isSingleSHUFPSMask(Mask)) {
10272 // If we have direct support for blends, we should lower by decomposing into
10273 // a permute. That will be faster than the domain cross.
10274 if (IsBlendSupported)
      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
                                                        Mask, DAG);
10278 // Try to lower by permuting the inputs into an unpack instruction.
    if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
            DL, MVT::v4i32, V1, V2, Mask, DAG))
      return Unpack;
  }
10284 // We implement this with SHUFPS because it can blend from two vectors.
10285 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // relevant.
10289 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10290 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10291 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
  return DAG.getBitcast(MVT::v4i32, ShufPS);
}
10295 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10296 /// shuffle lowering, and the most complex part.
10298 /// The lowering strategy is to try to form pairs of input lanes which are
10299 /// targeted at the same half of the final vector, and then use a dword shuffle
10300 /// to place them onto the right half, and finally unpack the paired lanes into
10301 /// their final position.
10303 /// The exact breakdown of how to form these dword pairs and align them on the
10304 /// correct sides is really tricky. See the comments within the function for
10305 /// more of the details.
10307 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10308 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10309 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10310 /// vector, form the analogous 128-bit 8-element Mask.
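///
/// As an illustration of the strategy, the single-input mask
/// <0, 1, 6, 7, 4, 5, 2, 3> already has its word pairs formed as dwords, so a
/// single PSHUFD with mask <0, 3, 2, 1> moves dwords 3 and 1 across the half
/// boundary and produces the desired ordering without any word shuffles.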
10311 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10312 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10313 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10314 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10315 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
10318 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10319 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
10321 SmallVector<int, 4> LoInputs;
10322 std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
10323 [](int M) { return M >= 0; });
10324 std::sort(LoInputs.begin(), LoInputs.end());
10325 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10326 SmallVector<int, 4> HiInputs;
10327 std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
10328 [](int M) { return M >= 0; });
10329 std::sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());

  int NumLToL =
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH =
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
10337 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10338 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10339 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10340 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
10342 // If we are splatting two values from one half - one to each half, then
10343 // we can shuffle that half so each is splatted to a dword, then splat those
10344 // to their respective halves.
  auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
                        int DOffset) {
    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10348 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10349 V = DAG.getNode(ShufWOp, DL, VT, V,
10350 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10351 V = DAG.getBitcast(PSHUFDVT, V);
10352 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10353 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    return DAG.getBitcast(VT, V);
  };

  if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10358 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10359 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10360 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
10362 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10363 // such inputs we can swap two of the dwords across the half mark and end up
10364 // with <=2 inputs to each half in each half. Once there, we can fall through
10365 // to the generic code below. For example:
10367 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10368 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10370 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10371 // and an existing 2-into-2 on the other half. In this case we may have to
10372 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10373 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10374 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10375 // because any other situation (including a 3-into-1 or 1-into-3 in the other
10376 // half than the one we target for fixing) will be fixed when we re-enter this
10377 // path. We will also combine away any sequence of PSHUFD instructions that
10378 // result into a single instruction. Here is an example of the tricky case:
10380 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10381 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10383 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10385 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10386 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10388 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10389 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10391 // The result is fine to be handled by the generic logic.
10392 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10393 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10394 int AOffset, int BOffset) {
10395 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10396 "Must call this with A having 3 or 1 inputs from the A half.");
10397 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10398 "Must call this with B having 1 or 3 inputs from the B half.");
10399 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10400 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10402 bool ThreeAInputs = AToAInputs.size() == 3;
10404 // Compute the index of dword with only one word among the three inputs in
10405 // a half by taking the sum of the half with three inputs and subtracting
10406 // the sum of the actual three inputs. The difference is the remaining
10408 int ADWord, BDWord;
10409 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
10410 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
10411 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
10412 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
10413 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
10414 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
10415 int TripleNonInputIdx =
10416 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
10417 TripleDWord = TripleNonInputIdx / 2;
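    // For example, if the three inputs of the A half are words {0, 1, 3},
    // TripleInputSum is 6, so TripleNonInputIdx is 2 and the dword holding
    // the lone remaining word is dword 1.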
10419 // We use xor with one to compute the adjacent DWord to whichever one the
10421 OneInputDWord = (OneInput / 2) ^ 1;
10423 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
10424 // and BToA inputs. If there is also such a problem with the BToB and AToB
10425 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
10426 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
10427 // is essential that we don't *create* a 3<-1 as then we might oscillate.
10428 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
10429 // Compute how many inputs will be flipped by swapping these DWords. We
10431 // to balance this to ensure we don't form a 3-1 shuffle in the other
10433 int NumFlippedAToBInputs =
10434 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
10435 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
10436 int NumFlippedBToBInputs =
10437 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
10438 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
10439 if ((NumFlippedAToBInputs == 1 &&
10440 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
10441 (NumFlippedBToBInputs == 1 &&
10442 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
10443 // We choose whether to fix the A half or B half based on whether that
10444 // half has zero flipped inputs. At zero, we may not be able to fix it
10445 // with that half. We also bias towards fixing the B half because that
10446 // will more commonly be the high half, and we have to bias one way.
10447 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
10448 ArrayRef<int> Inputs) {
10449 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
10450 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
10451 // Determine whether the free index is in the flipped dword or the
10452 // unflipped dword based on where the pinned index is. We use this bit
10453 // in an xor to conditionally select the adjacent dword.
10454 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
10455 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
        if (IsFixIdxInput == IsFixFreeIdxInput)
          FixFreeIdx += 1;
        IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10459 assert(IsFixIdxInput != IsFixFreeIdxInput &&
10460 "We need to be changing the number of flipped inputs!");
10461 int PSHUFHalfMask[] = {0, 1, 2, 3};
10462 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
10463 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
10465 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10467 for (int &M : Mask)
10468 if (M >= 0 && M == FixIdx)
10470 else if (M >= 0 && M == FixFreeIdx)
10473 if (NumFlippedBToBInputs != 0) {
10475 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
10476 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
10478 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
10479 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
10480 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
10485 int PSHUFDMask[] = {0, 1, 2, 3};
10486 PSHUFDMask[ADWord] = BDWord;
10487 PSHUFDMask[BDWord] = ADWord;
10488 V = DAG.getBitcast(
10490 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10491 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10493 // Adjust the mask to match the new locations of A and B.
10494 for (int &M : Mask)
10495 if (M >= 0 && M/2 == ADWord)
10496 M = 2 * BDWord + M % 2;
10497 else if (M >= 0 && M/2 == BDWord)
10498 M = 2 * ADWord + M % 2;
10500 // Recurse back into this routine to re-compute state now that this isn't
10501 // a 3 and 1 problem.
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
                                                     DAG);
  };
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
10506 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
10507 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
10508 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
10510 // At this point there are at most two inputs to the low and high halves from
10511 // each half. That means the inputs can always be grouped into dwords and
10512 // those dwords can then be moved to the correct half with a dword shuffle.
10513 // We use at most one low and one high word shuffle to collect these paired
10514 // inputs into dwords, and finally a dword shuffle to place them.
10515 int PSHUFLMask[4] = {-1, -1, -1, -1};
10516 int PSHUFHMask[4] = {-1, -1, -1, -1};
10517 int PSHUFDMask[4] = {-1, -1, -1, -1};
10519 // First fix the masks for all the inputs that are staying in their
10520 // original halves. This will then dictate the targets of the cross-half
10522 auto fixInPlaceInputs =
10523 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
10524 MutableArrayRef<int> SourceHalfMask,
10525 MutableArrayRef<int> HalfMask, int HalfOffset) {
10526 if (InPlaceInputs.empty())
10528 if (InPlaceInputs.size() == 1) {
10529 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10530 InPlaceInputs[0] - HalfOffset;
10531 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
10534 if (IncomingInputs.empty()) {
10535 // Just fix all of the in place inputs.
10536 for (int Input : InPlaceInputs) {
10537 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
10538 PSHUFDMask[Input / 2] = Input / 2;
10543 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
10544 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10545 InPlaceInputs[0] - HalfOffset;
10546 // Put the second input next to the first so that they are packed into
10547 // a dword. We find the adjacent index by toggling the low bit.
10548 int AdjIndex = InPlaceInputs[0] ^ 1;
10549 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
10550 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
10551 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
10553 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
10554 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
10556 // Now gather the cross-half inputs and place them into a free dword of
10557 // their target half.
10558 // FIXME: This operation could almost certainly be simplified dramatically to
10559 // look more like the 3-1 fixing operation.
10560 auto moveInputsToRightHalf = [&PSHUFDMask](
10561 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
10562 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
10563 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
10565 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
10566 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
10568 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
10570 int LowWord = Word & ~1;
10571 int HighWord = Word | 1;
10572 return isWordClobbered(SourceHalfMask, LowWord) ||
10573 isWordClobbered(SourceHalfMask, HighWord);
10576 if (IncomingInputs.empty())
10579 if (ExistingInputs.empty()) {
10580 // Map any dwords with inputs from them into the right half.
10581 for (int Input : IncomingInputs) {
10582 // If the source half mask maps over the inputs, turn those into
10583 // swaps and use the swapped lane.
10584 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
10585 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
10586 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
10587 Input - SourceOffset;
10588 // We have to swap the uses in our half mask in one sweep.
10589 for (int &M : HalfMask)
10590 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
10592 else if (M == Input)
10593 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
10595 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
10596 Input - SourceOffset &&
10597 "Previous placement doesn't match!");
10599 // Note that this correctly re-maps both when we do a swap and when
10600 // we observe the other side of the swap above. We rely on that to
10601 // avoid swapping the members of the input list directly.
10602 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
10605 // Map the input's dword into the correct half.
10606 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
10607 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
10609 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
10611 "Previous placement doesn't match!");
10614 // And just directly shift any other-half mask elements to be same-half
10615 // as we will have mirrored the dword containing the element into the
10616 // same position within that half.
10617 for (int &M : HalfMask)
10618 if (M >= SourceOffset && M < SourceOffset + 4) {
10619 M = M - SourceOffset + DestOffset;
10620 assert(M >= 0 && "This should never wrap below zero!");
10625 // Ensure we have the input in a viable dword of its current half. This
10626 // is particularly tricky because the original position may be clobbered
10627 // by inputs being moved and *staying* in that half.
10628 if (IncomingInputs.size() == 1) {
10629 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
10630 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
10632 SourceHalfMask[InputFixed - SourceOffset] =
10633 IncomingInputs[0] - SourceOffset;
10634 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
10636 IncomingInputs[0] = InputFixed;
10638 } else if (IncomingInputs.size() == 2) {
10639 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
10640 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
10641 // We have two non-adjacent or clobbered inputs we need to extract from
10642 // the source half. To do this, we need to map them into some adjacent
10643 // dword slot in the source mask.
10644 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
10645 IncomingInputs[1] - SourceOffset};
10647 // If there is a free slot in the source half mask adjacent to one of
10648 // the inputs, place the other input in it. We use (Index XOR 1) to
10649 // compute an adjacent index.
10650 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
10651 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
10652 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
10653 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
10654 InputsFixed[1] = InputsFixed[0] ^ 1;
10655 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
10656 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
10657 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
10658 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
10659 InputsFixed[0] = InputsFixed[1] ^ 1;
10660 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
10661 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
10662 // The two inputs are in the same DWord but it is clobbered and the
10663 // adjacent DWord isn't used at all. Move both inputs to the free
10665 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
10666 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
10667 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
10668 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
10670 // The only way we hit this point is if there is no clobbering
10671 // (because there are no off-half inputs to this half) and there is no
10672 // free slot adjacent to one of the inputs. In this case, we have to
10673 // swap an input with a non-input.
10674 for (int i = 0; i < 4; ++i)
10675 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
10676 "We can't handle any clobbers here!");
10677 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
10678 "Cannot have adjacent inputs here!");
10680 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
10681 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
10683 // We also have to update the final source mask in this case because
10684 // it may need to undo the above swap.
10685 for (int &M : FinalSourceHalfMask)
10686 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
10687 M = InputsFixed[1] + SourceOffset;
10688 else if (M == InputsFixed[1] + SourceOffset)
10689 M = (InputsFixed[0] ^ 1) + SourceOffset;
10691 InputsFixed[1] = InputsFixed[0] ^ 1;
10694 // Point everything at the fixed inputs.
10695 for (int &M : HalfMask)
10696 if (M == IncomingInputs[0])
10697 M = InputsFixed[0] + SourceOffset;
10698 else if (M == IncomingInputs[1])
10699 M = InputsFixed[1] + SourceOffset;
10701 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
10702 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
10705 llvm_unreachable("Unhandled input size!");
10708 // Now hoist the DWord down to the right half.
10709 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
10710 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
10711 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
10712 for (int &M : HalfMask)
10713 for (int Input : IncomingInputs)
10715 M = FreeDWord * 2 + Input % 2;
10717 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
10718 /*SourceOffset*/ 4, /*DestOffset*/ 0);
10719 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
10720 /*SourceOffset*/ 0, /*DestOffset*/ 4);
10722 // Now enact all the shuffles we've computed to move the inputs into their
10724 if (!isNoopShuffleMask(PSHUFLMask))
10725 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
10726 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
10727 if (!isNoopShuffleMask(PSHUFHMask))
10728 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
10729 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
10730 if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10734 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10736 // At this point, each half should contain all its inputs, and we can then
10737 // just shuffle them into their final position.
10738 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
10739 "Failed to lift all the high half inputs to the low mask!");
10740 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
10741 "Failed to lift all the low half inputs to the high mask!");
10743 // Do a half shuffle for the low mask.
10744 if (!isNoopShuffleMask(LoMask))
10745 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
10746 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
10748 // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
10753 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

  return V;
}

/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
10760 /// blend if only one input is used.
10761 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
10762 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
    bool &V2InUse) {
  SDValue V1Mask[16];
  SDValue V2Mask[16];

  V1InUse = false;
  V2InUse = false;

  int Size = Mask.size();
10771 int Scale = 16 / Size;
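// Illustrative sketch (not in the original source): for a v8i16 shuffle,
// Size == 8 and Scale == 2, so each mask entry expands to two byte selects.
// If, say, Mask[0] == 9 (element 1 of V2), bytes 0 and 1 of the result come
// from V2's bytes 2 and 3; V1Mask gets the 0x80 zero marker for those
// positions, V2Mask gets 2 and 3, and V2InUse becomes true.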
10772 for (int i = 0; i < 16; ++i) {
10773 if (Mask[i / Scale] < 0) {
10774 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
10776 const int ZeroMask = 0x80;
10777 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
10778                                    : ZeroMask;
10779 int V2Idx = Mask[i / Scale] < Size
10780                 ? ZeroMask
10781                 : (Mask[i / Scale] - Size) * Scale + i % Scale;
10782 if (Zeroable[i / Scale])
10783 V1Idx = V2Idx = ZeroMask;
10784 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
10785 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
10786 V1InUse |= (ZeroMask != V1Idx);
10787 V2InUse |= (ZeroMask != V2Idx);
10792 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
10793 DAG.getBitcast(MVT::v16i8, V1),
10794 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
10796 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
10797 DAG.getBitcast(MVT::v16i8, V2),
10798 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
10800 // If we need shuffled inputs from both, blend the two.
10802 if (V1InUse && V2InUse)
10803 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
10805 V = V1InUse ? V1 : V2;
10807 // Cast the result back to the correct type.
10808 return DAG.getBitcast(VT, V);
10811 /// \brief Generic lowering of 8-lane i16 shuffles.
10813 /// This handles both single-input shuffles and combined shuffle/blends with
10814 /// two inputs. The single input shuffles are immediately delegated to
10815 /// a dedicated lowering routine.
10817 /// The blends are lowered in one of three fundamental ways. If there are few
10818 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
10819 /// of the input is significantly cheaper when lowered as an interleaving of
10820 /// the two inputs, try to interleave them. Otherwise, blend the low and high
10821 /// halves of the inputs separately (making them have relatively few inputs)
10822 /// and then concatenate them.
10823 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10824 const SmallBitVector &Zeroable,
10825 SDValue V1, SDValue V2,
10826 const X86Subtarget &Subtarget,
10827 SelectionDAG &DAG) {
10828 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
10829 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
10830 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10832 // Whenever we can lower this as a zext, that instruction is strictly faster
10833 // than any alternative.
10834 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10835 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
10838 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
10840 if (NumV2Inputs == 0) {
10841 // Check for being able to broadcast a single element.
10842 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10843 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
10846 // Try to use shift instructions.
10847 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
10848 Zeroable, Subtarget, DAG))
10851 // Use dedicated unpack instructions for masks that match their pattern.
10853 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
10856 // Try to use byte rotation instructions.
10857 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
10858 Mask, Subtarget, DAG))
10861 // Make a copy of the mask so it can be modified.
10862 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
10863 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
10864 MutableMask, Subtarget,
10868 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
10869 "All single-input shuffles should be canonicalized to be V1-input "
10872 // Try to use shift instructions.
10873 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
10874 Zeroable, Subtarget, DAG))
10877 // See if we can use SSE4A Extraction / Insertion.
10878 if (Subtarget.hasSSE4A())
10879 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
10883 // There are special ways we can lower some single-element blends.
10884 if (NumV2Inputs == 1)
10885 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10886 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
10889 // We have different paths for blend lowering, but they all must use the
10890 // *exact* same predicate.
10891 bool IsBlendSupported = Subtarget.hasSSE41();
10892 if (IsBlendSupported)
10893 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
10894 Zeroable, Subtarget, DAG))
10897 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
10901 // Use dedicated unpack instructions for masks that match their pattern.
10903 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
10906 // Try to use byte rotation instructions.
10907 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10908 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
10911 if (SDValue BitBlend =
10912 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
10915 // Try to lower by permuting the inputs into an unpack instruction.
10916 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
10920 // If we can't directly blend but can use PSHUFB, that will be better as it
10921 // can both shuffle and set up the inefficient blend.
10922 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
10923 bool V1InUse, V2InUse;
10924 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
10925 Zeroable, DAG, V1InUse, V2InUse);
10928 // We can always bit-blend if we have to, so the fallback strategy is to
10929 // decompose into single-input permutes and blends.
10930 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
10934 /// \brief Check whether a compaction lowering can be done by dropping even
10935 /// elements and compute how many times even elements must be dropped.
10937 /// This handles shuffles which take every Nth element where N is a power of
10938 /// two. Example shuffle masks:
10940 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10941 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10942 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10943 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10944 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10945 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10947 /// Any of these lanes can of course be undef.
10949 /// This routine only supports N <= 3.
10950 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10953 /// \returns N above, or the number of times even elements must be dropped if
10954 /// there is such a number. Otherwise returns zero.
10955 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
10956 bool IsSingleInput) {
10957 // The modulus for the shuffle vector entries is based on whether this is
10958 // a single input or not.
10959 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10960 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10961 "We should only be called with masks with a power-of-2 size!");
10963 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10965 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10966 // and 2^3 simultaneously. This is because we may have ambiguity with
10967 // partially undef inputs.
10968 bool ViableForN[3] = {true, true, true};
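// Worked example (illustrative, not from the original source): for a
// single-input v16i8 mask {0,2,4,6,8,10,12,14, 0,2,4,6,8,10,12,14},
// ShuffleModulus == 16 and ModMask == 15. N == 1 stays viable because every
// Mask[i] equals (i << 1) & 15 (e.g. i == 8 gives 16 & 15 == 0), while N == 2
// dies at i == 1 since (1 << 2) & 15 == 4 != 2, so the routine returns 1.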
10970 for (int i = 0, e = Mask.size(); i < e; ++i) {
10971 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10976 bool IsAnyViable = false;
10977 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10978 if (ViableForN[j]) {
10979 uint64_t N = j + 1;
10981 // The shuffle mask must be equal to (i * 2^N) % M.
10982 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
10983 IsAnyViable = true;
10985 ViableForN[j] = false;
10987 // Early exit if we exhaust the possible powers of two.
10992 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10996 // Return 0 as there is no viable power of two.
11000 /// \brief Generic lowering of v16i8 shuffles.
11002 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
11003 /// detect any complexity reducing interleaving. If that doesn't help, it uses
11004 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
11005 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
11007 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11008 const SmallBitVector &Zeroable,
11009 SDValue V1, SDValue V2,
11010 const X86Subtarget &Subtarget,
11011 SelectionDAG &DAG) {
11012 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11013 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11014 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11016 // Try to use shift instructions.
11017 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
11018 Zeroable, Subtarget, DAG))
11021 // Try to use byte rotation instructions.
11022 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11023 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11026 // Try to use a zext lowering.
11027 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11028 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11031 // See if we can use SSE4A Extraction / Insertion.
11032 if (Subtarget.hasSSE4A())
11033 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
11037 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
11039 // For single-input shuffles, there are some nicer lowering tricks we can use.
11040 if (NumV2Elements == 0) {
11041 // Check for being able to broadcast a single element.
11042 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11043 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11046 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
11047 // Notably, this handles splat and partial-splat shuffles more efficiently.
11048 // However, it only makes sense if the pre-duplication shuffle simplifies
11049 // things significantly. Currently, this means we need to be able to
11050 // express the pre-duplication shuffle as an i16 shuffle.
11052 // FIXME: We should check for other patterns which can be widened into an
11053 // i16 shuffle as well.
11054 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
11055 for (int i = 0; i < 16; i += 2)
11056 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
11061 auto tryToWidenViaDuplication = [&]() -> SDValue {
11062 if (!canWidenViaDuplication(Mask))
11064 SmallVector<int, 4> LoInputs;
11065 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
11066 [](int M) { return M >= 0 && M < 8; });
11067 std::sort(LoInputs.begin(), LoInputs.end());
11068 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
11070 SmallVector<int, 4> HiInputs;
11071 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
11072 [](int M) { return M >= 8; });
11073 std::sort(HiInputs.begin(), HiInputs.end());
11074 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
11077 bool TargetLo = LoInputs.size() >= HiInputs.size();
11078 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
11079 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
11081 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11082 SmallDenseMap<int, int, 8> LaneMap;
11083 for (int I : InPlaceInputs) {
11084 PreDupI16Shuffle[I/2] = I/2;
11087 int j = TargetLo ? 0 : 4, je = j + 4;
11088 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
11089 // Check if j is already a shuffle of this input. This happens when
11090 // there are two adjacent bytes after we move the low one.
11091 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
11092 // If we haven't yet mapped the input, search for a slot into which
11094 while (j < je && PreDupI16Shuffle[j] >= 0)
11098 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
11101 // Map this input with the i16 shuffle.
11102 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
11105 // Update the lane map based on the mapping we ended up with.
11106 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
11108 V1 = DAG.getBitcast(
11110 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11111 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
11113 // Unpack the bytes to form the i16s that will be shuffled into place.
11114 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11115 MVT::v16i8, V1, V1);
11117 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
11118 for (int i = 0; i < 16; ++i)
11119 if (Mask[i] >= 0) {
11120 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
11121 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
11122 if (PostDupI16Shuffle[i / 2] < 0)
11123 PostDupI16Shuffle[i / 2] = MappedMask;
11125 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
11126 "Conflicting entrties in the original shuffle!");
11128 return DAG.getBitcast(
11130 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11131 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
11133 if (SDValue V = tryToWidenViaDuplication())
11137 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
11141 // Use dedicated unpack instructions for masks that match their pattern.
11143 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
11146 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
11147 // with PSHUFB. It is important to do this before we attempt to generate any
11148 // blends but after all of the single-input lowerings. If the single input
11149 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
11150 // want to preserve that and we can DAG combine any longer sequences into
11151 // a PSHUFB in the end. But once we start blending from multiple inputs,
11152 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
11153 // and there are *very* few patterns that would actually be faster than the
11154 // PSHUFB approach because of its ability to zero lanes.
11156 // FIXME: The only exceptions to the above are blends which are exact
11157 // interleavings with direct instructions supporting them. We currently don't
11158 // handle those well here.
11159 if (Subtarget.hasSSSE3()) {
11160 bool V1InUse = false;
11161 bool V2InUse = false;
11163 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
11164 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
11166 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
11167 // do so. This avoids using them to handle blends-with-zero which is
11168 // important as a single pshufb is significantly faster for that.
11169 if (V1InUse && V2InUse) {
11170 if (Subtarget.hasSSE41())
11171 if (SDValue Blend = lowerVectorShuffleAsBlend(
11172 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11175 // We can use an unpack to do the blending rather than an or in some
11176 // cases. Even though the OR may be (very slightly) more efficient, we
11177 // prefer this lowering because there are common cases where part of
11178 // the complexity of the shuffles goes away when we do the final blend as
11180 // FIXME: It might be worth trying to detect if the unpack-feeding
11181 // shuffles will both be pshufb, in which case we shouldn't bother with
11183 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11184 DL, MVT::v16i8, V1, V2, Mask, DAG))
11191 // There are special ways we can lower some single-element blends.
11192 if (NumV2Elements == 1)
11193 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11194 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11197 if (SDValue BitBlend =
11198 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
11201 // Check whether a compaction lowering can be done. This handles shuffles
11202 // which take every Nth element for some even N. See the helper function for
11205 // We special case these as they can be particularly efficiently handled with
11206 // the PACKUSWB instruction on x86 and they show up in common patterns of
11207 // rearranging bytes to truncate wide elements.
11208 bool IsSingleInput = V2.isUndef();
11209 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
11210 // NumEvenDrops is the power of two stride of the elements. Another way of
11211 // thinking about it is that we need to drop the even elements this many
11212 // times to get the original input.
11214 // First we need to zero all the dropped bytes.
11215 assert(NumEvenDrops <= 3 &&
11216 "No support for dropping even elements more than 3 times.");
11217 // We use the mask type to pick which bytes are preserved based on how many
11218 // elements are dropped.
11219 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
11220 SDValue ByteClearMask = DAG.getBitcast(
11221 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
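// For instance (illustrative): with NumEvenDrops == 2 the constant above is a
// v4i32 splat of 0xFF, which viewed as v16i8 is {FF,0,0,0, FF,0,0,0, ...}.
// The AND keeps only byte 0 of every dword, and the two rounds of PACKUS
// below then compact the surviving bytes, i.e. every 4th byte of the input.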
11222 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
11223 if (!IsSingleInput)
11224 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
11226 // Now pack things back together.
11227 V1 = DAG.getBitcast(MVT::v8i16, V1);
11228 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
11229 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
11230 for (int i = 1; i < NumEvenDrops; ++i) {
11231 Result = DAG.getBitcast(MVT::v8i16, Result);
11232 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
11238 // Handle multi-input cases by blending single-input shuffles.
11239 if (NumV2Elements > 0)
11240 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
11243 // The fallback path for single-input shuffles widens this into two v8i16
11244 // vectors with unpacks, shuffles those, and then pulls them back together
// with a pack.
11248 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11249 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11250 for (int i = 0; i < 16; ++i)
11252 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
11254 SDValue VLoHalf, VHiHalf;
11255 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
11256 // them out and avoid using UNPCK{L,H} to extract the elements of V as
11258 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
11259 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
11260 // Use a mask to drop the high bytes.
11261 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
11262 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
11263 DAG.getConstant(0x00FF, DL, MVT::v8i16));
11265 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
11266 VHiHalf = DAG.getUNDEF(MVT::v8i16);
11268 // Squash the masks to point directly into VLoHalf.
11269 for (int &M : LoBlendMask)
11272 for (int &M : HiBlendMask)
11276 // Otherwise just unpack the low half of V into VLoHalf and the high half into
11277 // VHiHalf so that we can blend them as i16s.
11278 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
11280 VLoHalf = DAG.getBitcast(
11281 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
11282 VHiHalf = DAG.getBitcast(
11283 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
11286 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
11287 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
11289 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
11292 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
11294 /// This routine breaks down the specific type of 128-bit shuffle and
11295 /// dispatches to the lowering routines accordingly.
11296 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11297 MVT VT, SDValue V1, SDValue V2,
11298 const SmallBitVector &Zeroable,
11299 const X86Subtarget &Subtarget,
11300 SelectionDAG &DAG) {
11301 switch (VT.SimpleTy) {
11303 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11305 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11307 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11309 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11311 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11313 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11316 llvm_unreachable("Unimplemented!");
11320 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
11322 /// This routine just extracts two subvectors, shuffles them independently, and
11323 /// then concatenates them back together. This should work effectively with all
11324 /// AVX vector shuffle types.
11325 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11326 SDValue V2, ArrayRef<int> Mask,
11327 SelectionDAG &DAG) {
11328 assert(VT.getSizeInBits() >= 256 &&
11329 "Only for 256-bit or wider vector shuffles!");
11330 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
11331 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
11333 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
11334 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
11336 int NumElements = VT.getVectorNumElements();
11337 int SplitNumElements = NumElements / 2;
11338 MVT ScalarVT = VT.getVectorElementType();
11339 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
11341 // Rather than splitting build-vectors, just build two narrower build
11342 // vectors. This helps shuffling with splats and zeros.
11343 auto SplitVector = [&](SDValue V) {
11344 V = peekThroughBitcasts(V);
11346 MVT OrigVT = V.getSimpleValueType();
11347 int OrigNumElements = OrigVT.getVectorNumElements();
11348 int OrigSplitNumElements = OrigNumElements / 2;
11349 MVT OrigScalarVT = OrigVT.getVectorElementType();
11350 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
11354 auto *BV = dyn_cast<BuildVectorSDNode>(V);
11356 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11357 DAG.getIntPtrConstant(0, DL));
11358 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11359 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
11362 SmallVector<SDValue, 16> LoOps, HiOps;
11363 for (int i = 0; i < OrigSplitNumElements; ++i) {
11364 LoOps.push_back(BV->getOperand(i));
11365 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
11367 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
11368 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
11370 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
11371 DAG.getBitcast(SplitVT, HiV));
11374 SDValue LoV1, HiV1, LoV2, HiV2;
11375 std::tie(LoV1, HiV1) = SplitVector(V1);
11376 std::tie(LoV2, HiV2) = SplitVector(V2);
11378 // Now create two 4-way blends of these half-width vectors.
11379 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
11380 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
11381 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
11382 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
11383 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
11384 for (int i = 0; i < SplitNumElements; ++i) {
11385 int M = HalfMask[i];
11386 if (M >= NumElements) {
11387 if (M >= NumElements + SplitNumElements)
11391 V2BlendMask[i] = M - NumElements;
11392 BlendMask[i] = SplitNumElements + i;
11393 } else if (M >= 0) {
11394 if (M >= SplitNumElements)
11398 V1BlendMask[i] = M;
11403 // Because the lowering happens after all combining takes place, we need to
11404 // manually combine these blend masks as much as possible so that we create
11405 // a minimal number of high-level vector shuffle nodes.
11407 // First try just blending the halves of V1 or V2.
11408 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
11409 return DAG.getUNDEF(SplitVT);
11410 if (!UseLoV2 && !UseHiV2)
11411 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11412 if (!UseLoV1 && !UseHiV1)
11413 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11415 SDValue V1Blend, V2Blend;
11416 if (UseLoV1 && UseHiV1) {
11418 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11420 // We only use half of V1 so map the usage down into the final blend mask.
11421 V1Blend = UseLoV1 ? LoV1 : HiV1;
11422 for (int i = 0; i < SplitNumElements; ++i)
11423 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
11424 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
11426 if (UseLoV2 && UseHiV2) {
11428 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11430 // We only use half of V2 so map the usage down into the final blend mask.
11431 V2Blend = UseLoV2 ? LoV2 : HiV2;
11432 for (int i = 0; i < SplitNumElements; ++i)
11433 if (BlendMask[i] >= SplitNumElements)
11434 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
11436 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
11438 SDValue Lo = HalfBlend(LoMask);
11439 SDValue Hi = HalfBlend(HiMask);
11440 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
11443 /// \brief Either split a vector in halves or decompose the shuffles and the
11446 /// This is provided as a good fallback for many lowerings of non-single-input
11447 /// shuffles with more than one 128-bit lane. In those cases, we want to select
11448 /// between splitting the shuffle into 128-bit components and stitching those
11449 /// back together vs. extracting the single-input shuffles and blending those
/// results.
11451 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
11452 SDValue V1, SDValue V2,
11453 ArrayRef<int> Mask,
11454 SelectionDAG &DAG) {
11455 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
11456 "shuffles as it could then recurse on itself.");
11457 int Size = Mask.size();
11459 // If this can be modeled as a broadcast of two elements followed by a blend,
11460 // prefer that lowering. This is especially important because broadcasts can
11461 // often fold with memory operands.
11462 auto DoBothBroadcast = [&] {
11463 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
11466 if (V2BroadcastIdx < 0)
11467 V2BroadcastIdx = M - Size;
11468 else if (M - Size != V2BroadcastIdx)
11470 } else if (M >= 0) {
11471 if (V1BroadcastIdx < 0)
11472 V1BroadcastIdx = M;
11473 else if (M != V1BroadcastIdx)
11478 if (DoBothBroadcast())
11479 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
11482 // If the inputs all stem from a single 128-bit lane of each input, then we
11483 // split them rather than blending because the split will decompose to
11484 // unusually few instructions.
11485 int LaneCount = VT.getSizeInBits() / 128;
11486 int LaneSize = Size / LaneCount;
11487 SmallBitVector LaneInputs[2];
11488 LaneInputs[0].resize(LaneCount, false);
11489 LaneInputs[1].resize(LaneCount, false);
11490 for (int i = 0; i < Size; ++i)
11492 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
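// As a hypothetical example: for a v8i32 shuffle whose V1 references are all
// to elements 0-3 and whose V2 references are all to elements 8-11, both
// LaneInputs counts are 1 (only the low lane of each source is used), so the
// shuffle is split into two cheap 128-bit shuffles.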
11493 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
11494 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11496 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
11497 // that the decomposed single-input shuffles don't end up here.
11498 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
11501 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
11502 /// a permutation and blend of those lanes.
11504 /// This essentially blends the out-of-lane inputs to each lane into the lane
11505 /// from a permuted copy of the vector. This lowering strategy results in four
11506 /// instructions in the worst case for a single-input cross lane shuffle which
11507 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
11508 /// of. Special cases for each particular shuffle pattern should be handled
11509 /// prior to trying this lowering.
11510 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
11511 SDValue V1, SDValue V2,
11512 ArrayRef<int> Mask,
11513 SelectionDAG &DAG) {
11514 // FIXME: This should probably be generalized for 512-bit vectors as well.
11515 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
11516 int Size = Mask.size();
11517 int LaneSize = Size / 2;
11519 // If there are only inputs from one 128-bit lane, splitting will in fact be
11520 // less expensive. The flags track whether the given lane contains an element
11521 // that crosses to another lane.
11522 bool LaneCrossing[2] = {false, false};
11523 for (int i = 0; i < Size; ++i)
11524 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11525 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
11526 if (!LaneCrossing[0] || !LaneCrossing[1])
11527 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11529 assert(V2.isUndef() &&
11530 "This last part of this routine only works on single input shuffles");
11532 SmallVector<int, 32> FlippedBlendMask(Size);
11533 for (int i = 0; i < Size; ++i)
11534 FlippedBlendMask[i] =
11535 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
11537 : Mask[i] % LaneSize +
11538 (i / LaneSize) * LaneSize + Size);
11540 // Flip the vector, and blend the results which should now be in-lane. The
11541 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
11542 // 5 for the high source. The value 3 selects the high half of source 2 and
11543 // the value 2 selects the low half of source 2. We only use source 2 to
11544 // allow folding it into a memory operand.
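// Concretely (illustrative example): PERMMask below is 3 | (2 << 4) == 0x23.
// With operands (UNDEF, V1), selector 3 places the high 128-bit half of V1
// into the low half of Flipped and selector 2 places the low half of V1 into
// the high half, i.e. Flipped is V1 with its two 128-bit halves swapped. The
// FlippedBlendMask then picks every element from whichever of V1/Flipped
// already holds it in the correct lane.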
11545 unsigned PERMMask = 3 | 2 << 4;
11546 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
11547 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
11548 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
11551 /// \brief Handle lowering 2-lane 128-bit shuffles.
11552 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11553 SDValue V2, ArrayRef<int> Mask,
11554 const SmallBitVector &Zeroable,
11555 const X86Subtarget &Subtarget,
11556 SelectionDAG &DAG) {
11557 SmallVector<int, 4> WidenedMask;
11558 if (!canWidenShuffleElements(Mask, WidenedMask))
11561 // TODO: If minimizing size and one of the inputs is a zero vector and the
11562 // zero vector has only one use, we could use a VPERM2X128 to save the
11563 // instruction bytes needed to explicitly generate the zero vector.
11565 // Blends are faster and handle all the non-lane-crossing cases.
11566 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
11567 Zeroable, Subtarget, DAG))
11570 bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
11571 bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
11573 // If either input operand is a zero vector, use VPERM2X128 because its mask
11574 // allows us to replace the zero input with an implicit zero.
11575 if (!IsV1Zero && !IsV2Zero) {
11576 // Check for patterns which can be matched with a single insert of a 128-bit
11578 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
11579 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
11580 // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
11581 if (Subtarget.hasAVX2() && V2.isUndef())
11584 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
11585 VT.getVectorNumElements() / 2);
11586 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
11587 DAG.getIntPtrConstant(0, DL));
11588 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
11589 OnlyUsesV1 ? V1 : V2,
11590 DAG.getIntPtrConstant(0, DL));
11591 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
11595 // Otherwise form a 128-bit permutation. After accounting for undefs,
11596 // convert the 64-bit shuffle mask selection values into 128-bit
11597 // selection bits by dividing the indexes by 2 and shifting into positions
11598 // defined by a vperm2*128 instruction's immediate control byte.
11600 // The immediate permute control byte looks like this:
11601 // [1:0] - select 128 bits from sources for low half of destination
11603 // [3] - zero low half of destination
11604 // [5:4] - select 128 bits from sources for high half of destination
11606 // [7] - zero high half of destination
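// A hypothetical example: WidenedMask == {1, 2} (high 128 bits of V1 followed
// by low 128 bits of V2) gives MaskLO == 1 and MaskHI == 2, so
// PermMask == 1 | (2 << 4) == 0x21. If V2 had been the all-zeros vector, the
// code below would instead set bit 7 (0x80) and feed an undef V2, letting the
// instruction materialize the zero upper half for free.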
11608 int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
11609 int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
11611 unsigned PermMask = MaskLO | (MaskHI << 4);
11613 // If either input is a zero vector, replace it with an undef input.
11614 // Shuffle mask values < 4 are selecting elements of V1.
11615 // Shuffle mask values >= 4 are selecting elements of V2.
11616 // Adjust each half of the permute mask by clearing the half that was
11617 // selecting the zero vector and setting the zero mask bit.
11619 V1 = DAG.getUNDEF(VT);
11621 PermMask = (PermMask & 0xf0) | 0x08;
11623 PermMask = (PermMask & 0x0f) | 0x80;
11626 V2 = DAG.getUNDEF(VT);
11628 PermMask = (PermMask & 0xf0) | 0x08;
11630 PermMask = (PermMask & 0x0f) | 0x80;
11633 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
11634 DAG.getConstant(PermMask, DL, MVT::i8));
11637 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
11638 /// shuffling each lane.
11640 /// This will only succeed when the result of fixing the 128-bit lanes results
11641 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
11642 /// each 128-bit lane. This handles many cases where we can quickly blend away
11643 /// the lane crosses early and then use simpler shuffles within each lane.
11645 /// FIXME: It might be worthwhile at some point to support this without
11646 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
11647 /// in x86 only floating point has interesting non-repeating shuffles, and even
11648 /// those are still *marginally* more expensive.
11649 static SDValue lowerVectorShuffleByMerging128BitLanes(
11650 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11651 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11652 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
11654 int Size = Mask.size();
11655 int LaneSize = 128 / VT.getScalarSizeInBits();
11656 int NumLanes = Size / LaneSize;
11657 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
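// Illustrative walk-through (not from the original source): for a v8i32
// shuffle with Mask == {2,3,2,3, 14,15,14,15}, each destination lane pulls
// from a single source lane (lane 0 of V1 and lane 1 of V2) and both use the
// repeating in-lane pattern {2,3,2,3}. The code below first performs a v4i64
// lane fix-up with mask {0,1, 6,7} and then a non-crossing v8i32 shuffle with
// mask {2,3,2,3, 6,7,6,7}.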
11659 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
11660 // check whether the in-128-bit lane shuffles share a repeating pattern.
11661 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
11662 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
11663 for (int i = 0; i < Size; ++i) {
11667 int j = i / LaneSize;
11669 if (Lanes[j] < 0) {
11670 // First entry we've seen for this lane.
11671 Lanes[j] = Mask[i] / LaneSize;
11672 } else if (Lanes[j] != Mask[i] / LaneSize) {
11673 // This doesn't match the lane selected previously!
11677 // Check that within each lane we have a consistent shuffle mask.
11678 int k = i % LaneSize;
11679 if (InLaneMask[k] < 0) {
11680 InLaneMask[k] = Mask[i] % LaneSize;
11681 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
11682 // This doesn't fit a repeating in-lane mask.
11687 // First shuffle the lanes into place.
11688 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
11689 VT.getSizeInBits() / 64);
11690 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
11691 for (int i = 0; i < NumLanes; ++i)
11692 if (Lanes[i] >= 0) {
11693 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
11694 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
11697 V1 = DAG.getBitcast(LaneVT, V1);
11698 V2 = DAG.getBitcast(LaneVT, V2);
11699 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
11701 // Cast it back to the type we actually want.
11702 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
11704 // Now do a simple shuffle that isn't lane crossing.
11705 SmallVector<int, 8> NewMask((unsigned)Size, -1);
11706 for (int i = 0; i < Size; ++i)
11708 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
11709 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
11710 "Must not introduce lane crosses at this point!");
11712 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
11715 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
11716 /// This allows for fast cases such as subvector extraction/insertion
11717 /// or shuffling smaller vector types which can lower more efficiently.
11718 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
11719 SDValue V1, SDValue V2,
11720 ArrayRef<int> Mask,
11721 const X86Subtarget &Subtarget,
11722 SelectionDAG &DAG) {
11723 assert(VT.is256BitVector() && "Expected 256-bit vector");
11725 unsigned NumElts = VT.getVectorNumElements();
11726 unsigned HalfNumElts = NumElts / 2;
11727 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
11729 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
11730 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
11731 if (!UndefLower && !UndefUpper)
11734 // Upper half is undef and lower half is whole upper subvector.
11735 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
11737 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
11738 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
11739 DAG.getIntPtrConstant(HalfNumElts, DL));
11740 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
11741 DAG.getIntPtrConstant(0, DL));
11744 // Lower half is undef and upper half is whole lower subvector.
11745 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
11747 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
11748 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
11749 DAG.getIntPtrConstant(0, DL));
11750 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
11751 DAG.getIntPtrConstant(HalfNumElts, DL));
11754 // If the shuffle only uses two of the four halves of the input operands,
11755 // then extract them and perform the 'half' shuffle at half width.
11756 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
11757 int HalfIdx1 = -1, HalfIdx2 = -1;
11758 SmallVector<int, 8> HalfMask(HalfNumElts);
11759 unsigned Offset = UndefLower ? HalfNumElts : 0;
11760 for (unsigned i = 0; i != HalfNumElts; ++i) {
11761 int M = Mask[i + Offset];
11767 // Determine which of the 4 half vectors this element is from.
11768 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
11769 int HalfIdx = M / HalfNumElts;
11771 // Determine the element index into its half vector source.
11772 int HalfElt = M % HalfNumElts;
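// For example (illustrative): with a 256-bit v8f32 shuffle, HalfNumElts == 4,
// so M == 6 maps to HalfIdx == 1 (upper half of V1) with HalfElt == 2, and
// M == 10 maps to HalfIdx == 2 (lower half of V2) with HalfElt == 2.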
11774 // We can shuffle with up to 2 half vectors, set the new 'half'
11775 // shuffle mask accordingly.
11776 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
11777 HalfMask[i] = HalfElt;
11778 HalfIdx1 = HalfIdx;
11781 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
11782 HalfMask[i] = HalfElt + HalfNumElts;
11783 HalfIdx2 = HalfIdx;
11787 // Too many half vectors referenced.
11790 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
11792 // Only shuffle the halves of the inputs when useful.
11793 int NumLowerHalves =
11794 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
11795 int NumUpperHalves =
11796 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
11798 // uuuuXXXX - don't extract uppers just to insert again.
11799 if (UndefLower && NumUpperHalves != 0)
11802 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
11803 if (UndefUpper && NumUpperHalves == 2)
11806 // AVX2 - XXXXuuuu - always extract lowers.
11807 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
11808 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
11809 if (VT == MVT::v4f64 || VT == MVT::v4i64)
11811 // AVX2 supports variable 32-bit element cross-lane shuffles.
11812 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
11813 // XXXXuuuu - don't extract lowers and uppers.
11814 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
11819 auto GetHalfVector = [&](int HalfIdx) {
11821 return DAG.getUNDEF(HalfVT);
11822 SDValue V = (HalfIdx < 2 ? V1 : V2);
11823 HalfIdx = (HalfIdx % 2) * HalfNumElts;
11824 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
11825 DAG.getIntPtrConstant(HalfIdx, DL));
11828 SDValue Half1 = GetHalfVector(HalfIdx1);
11829 SDValue Half2 = GetHalfVector(HalfIdx2);
11830 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
11831 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
11832 DAG.getIntPtrConstant(Offset, DL));
11835 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
11838 /// This returns true if the elements from a particular input are already in the
11839 /// slot required by the given mask and require no permutation.
11840 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
11841 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
11842 int Size = Mask.size();
11843 for (int i = 0; i < Size; ++i)
11844 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
11850 /// Handle the case where shuffle sources come from the same 128-bit lane and
11851 /// every lane can be represented as the same repeating mask - allowing us to
11852 /// shuffle the sources with the repeating shuffle and then permute the result
11853 /// to the destination lanes.
11854 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
11855 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11856 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11857 int NumElts = VT.getVectorNumElements();
11858 int NumLanes = VT.getSizeInBits() / 128;
11859 int NumLaneElts = NumElts / NumLanes;
11861 // On AVX2 we may be able to just shuffle the lowest elements and then
11862 // broadcast the result.
11863 if (Subtarget.hasAVX2()) {
11864 for (unsigned BroadcastSize : {16, 32, 64}) {
11865 if (BroadcastSize <= VT.getScalarSizeInBits())
11867 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
11869 // Attempt to match a repeating pattern every NumBroadcastElts,
11870 // accounting for UNDEFs while only referencing the lowest 128-bit
11871 // lane of the inputs.
11872 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
11873 for (int i = 0; i != NumElts; i += NumBroadcastElts)
11874 for (int j = 0; j != NumBroadcastElts; ++j) {
11875 int M = Mask[i + j];
11878 int &R = RepeatMask[j];
11879 if (0 != ((M % NumElts) / NumLaneElts))
11881 if (0 <= R && R != M)
11888 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
11889 if (!FindRepeatingBroadcastMask(RepeatMask))
11892 // Shuffle the (lowest) repeated elements in place for broadcast.
11893 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
11895 // Shuffle the actual broadcast.
11896 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
11897 for (int i = 0; i != NumElts; i += NumBroadcastElts)
11898 for (int j = 0; j != NumBroadcastElts; ++j)
11899 BroadcastMask[i + j] = j;
11900 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
11905 // Bail if the shuffle mask doesn't cross 128-bit lanes.
11906 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
11909 // Bail if we already have a repeated lane shuffle mask.
11910 SmallVector<int, 8> RepeatedShuffleMask;
11911 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
11914 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
11915 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
11916 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
11917 int NumSubLanes = NumLanes * SubLaneScale;
11918 int NumSubLaneElts = NumLaneElts / SubLaneScale;
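// E.g. (illustrative) for a v8i32 shuffle on AVX2: NumLanes == 2 and
// SubLaneScale == 2, giving NumSubLanes == 4 sub-lanes of NumSubLaneElts == 2
// elements each, which is exactly the granularity VPERMQ/VPERMPD can permute.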
11920 // Check that all the sources are coming from the same lane and see if we can
11921 // form a repeating shuffle mask (local to each sub-lane). At the same time,
11922 // determine the source sub-lane for each destination sub-lane.
11923 int TopSrcSubLane = -1;
11924 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
11925 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
11926 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
11927 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
11929 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
11930 // Extract the sub-lane mask, check that it all comes from the same lane
11931 // and normalize the mask entries to come from the first lane.
11933 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
11934 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
11935 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
11938 int Lane = (M % NumElts) / NumLaneElts;
11939 if ((0 <= SrcLane) && (SrcLane != Lane))
11942 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
11943 SubLaneMask[Elt] = LocalM;
11946 // Whole sub-lane is UNDEF.
11950 // Attempt to match against the candidate repeated sub-lane masks.
11951 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
11952 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
11953 for (int i = 0; i != NumSubLaneElts; ++i) {
11954 if (M1[i] < 0 || M2[i] < 0)
11956 if (M1[i] != M2[i])
11962 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
11963 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
11966 // Merge the sub-lane mask into the matching repeated sub-lane mask.
11967 for (int i = 0; i != NumSubLaneElts; ++i) {
11968 int M = SubLaneMask[i];
11971 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
11972 "Unexpected mask element");
11973 RepeatedSubLaneMask[i] = M;
11976 // Track the topmost source sub-lane - by setting the remaining to UNDEF
11977 // we can greatly simplify shuffle matching.
11978 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
11979 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
11980 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
11984 // Bail if we failed to find a matching repeated sub-lane mask.
11985 if (Dst2SrcSubLanes[DstSubLane] < 0)
11988 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
11989 "Unexpected source lane");
11991 // Create a repeating shuffle mask for the entire vector.
11992 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
11993 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
11994 int Lane = SubLane / SubLaneScale;
11995 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
11996 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
11997 int M = RepeatedSubLaneMask[Elt];
12000 int Idx = (SubLane * NumSubLaneElts) + Elt;
12001 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12004 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12006 // Shuffle each source sub-lane to its destination.
12007 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12008 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12009 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12010 if (SrcSubLane < 0)
12012 for (int j = 0; j != NumSubLaneElts; ++j)
12013 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12016 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12020 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12021 unsigned &ShuffleImm,
12022 ArrayRef<int> Mask) {
12023 int NumElts = VT.getVectorNumElements();
12024 assert(VT.getScalarType() == MVT::f64 &&
12025 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12026 "Unexpected data type for VSHUFPD");
12028 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
12029 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
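// Worked example (illustrative): for v4f64 with Mask == {1, 5, 2, 7}, every
// element stays within its pair slot, so ShufpdMask holds and the loop builds
// ShuffleImm == 0b1011, i.e. SHUFPD picks V1[1], V2[1], V1[2], V2[3].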
12031 bool ShufpdMask = true;
12032 bool CommutableMask = true;
12033 for (int i = 0; i < NumElts; ++i) {
12034 if (Mask[i] == SM_SentinelUndef)
12038 int Val = (i & 6) + NumElts * (i & 1);
12039 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12040 if (Mask[i] < Val || Mask[i] > Val + 1)
12041 ShufpdMask = false;
12042 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12043 CommutableMask = false;
12044 ShuffleImm |= (Mask[i] % 2) << i;
12049 if (CommutableMask) {
12057 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12058 ArrayRef<int> Mask, SDValue V1,
12059 SDValue V2, SelectionDAG &DAG) {
12060 unsigned Immediate = 0;
12061 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12064 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12065 DAG.getConstant(Immediate, DL, MVT::i8));
12068 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12069 ArrayRef<int> Mask, SDValue V1,
12070 SDValue V2, SelectionDAG &DAG) {
12071 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12072 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12074 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12076 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12078 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12081 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12083 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12084 /// isn't available.
12085 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12086 const SmallBitVector &Zeroable,
12087 SDValue V1, SDValue V2,
12088 const X86Subtarget &Subtarget,
12089 SelectionDAG &DAG) {
12090 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12091 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12092 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12094 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12095 Zeroable, Subtarget, DAG))
12098 if (V2.isUndef()) {
12099 // Check for being able to broadcast a single element.
12100 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12101 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12104 // Use low duplicate instructions for masks that match their pattern.
12105 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12106 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12108 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12109 // Non-half-crossing single input shuffles can be lowered with an
12110 // interleaved permutation.
12111 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12112 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
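// For instance (illustrative): Mask == {1, 0, 3, 2} yields
// VPERMILPMask == 0b0101, and VPERMILPD with that immediate swaps the two
// doubles inside each 128-bit lane, exactly matching the requested shuffle.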
12113 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12114 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12117 // With AVX2 we have direct support for this permutation.
12118 if (Subtarget.hasAVX2())
12119 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12120 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12122 // Try to create an in-lane repeating shuffle mask and then shuffle
12123 // the results into the target lanes.
12124 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12125 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12128 // Otherwise, fall back.
12129 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12133 // Use dedicated unpack instructions for masks that match their pattern.
12135 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12138 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12139 Zeroable, Subtarget, DAG))
12142 // Check if the blend happens to exactly fit that of SHUFPD.
12144 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12147 // Try to create an in-lane repeating shuffle mask and then shuffle
12148 // the results into the target lanes.
12149 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12150 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12153 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12154 // shuffle. However, if we have AVX2 and either input is already in place,
12155 // we will be able to shuffle the other input even across lanes in a single
12156 // instruction, so skip this pattern.
12157 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12158 isShuffleMaskInputInPlace(1, Mask))))
12159 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12160 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12163 // If we have AVX2 then we always want to lower with a blend because at v4 we
12164 // can fully permute the elements.
12165 if (Subtarget.hasAVX2())
12166 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12169 // Otherwise fall back on generic lowering.
12170 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12173 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12175 /// This routine is only called when we have AVX2 and thus a reasonable
12176 /// instruction set for v4i64 shuffling.
12177 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12178 const SmallBitVector &Zeroable,
12179 SDValue V1, SDValue V2,
12180 const X86Subtarget &Subtarget,
12181 SelectionDAG &DAG) {
12182 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12183 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12184 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12185 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12187 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12188 Zeroable, Subtarget, DAG))
12191 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12192 Zeroable, Subtarget, DAG))
12195 // Check for being able to broadcast a single element.
12196 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12197 Mask, Subtarget, DAG))
12200 if (V2.isUndef()) {
12201 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12202 // can use lower latency instructions that will operate on both lanes.
12203 SmallVector<int, 2> RepeatedMask;
12204 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12205 SmallVector<int, 4> PSHUFDMask;
12206 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
12207 return DAG.getBitcast(
12209 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12210 DAG.getBitcast(MVT::v8i32, V1),
12211 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12214 // AVX2 provides a direct instruction for permuting a single input across
// lanes.
12216 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12217 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12220 // Try to use shift instructions.
12221 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12222 Zeroable, Subtarget, DAG))
12225 // If we have VLX support, we can use VALIGN.
12226 if (Subtarget.hasVLX())
12227 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12228 Mask, Subtarget, DAG))
12231 // Try to use PALIGNR.
12232 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12233 Mask, Subtarget, DAG))
12236 // Use dedicated unpack instructions for masks that match their pattern.
12238 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12241 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12242 // shuffle. However, if we have AVX2 and either input is already in place,
12243 // we will be able to shuffle the other input even across lanes in a single
12244 // instruction, so skip this pattern.
12245 if (!isShuffleMaskInputInPlace(0, Mask) &&
12246 !isShuffleMaskInputInPlace(1, Mask))
12247 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12248 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12251 // Otherwise fall back on generic blend lowering.
12252 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12256 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12258 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12259 /// isn't available.
12260 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12261 const SmallBitVector &Zeroable,
12262 SDValue V1, SDValue V2,
12263 const X86Subtarget &Subtarget,
12264 SelectionDAG &DAG) {
12265 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12266 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12267 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12269 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12270 Zeroable, Subtarget, DAG))
12273 // Check for being able to broadcast a single element.
12274 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12275 Mask, Subtarget, DAG))
12278 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12279 // options to efficiently lower the shuffle.
12280 SmallVector<int, 4> RepeatedMask;
12281 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12282 assert(RepeatedMask.size() == 4 &&
12283 "Repeated masks must be half the mask width!");
12285 // Use even/odd duplicate instructions for masks that match their pattern.
12286 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12287 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12288 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12289 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12292 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12293 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12295 // Use dedicated unpack instructions for masks that match their pattern.
12297 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12300 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12301 // have already handled any direct blends.
12302 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12305 // Try to create an in-lane repeating shuffle mask and then shuffle the
12306 // results into the target lanes.
12307 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12308 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12311 // If we have a single-input shuffle with different shuffle patterns in the
12312 // two 128-bit lanes, use a variable mask with VPERMILPS.
12313 if (V2.isUndef()) {
12314 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12315 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12316 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12318 if (Subtarget.hasAVX2())
12319 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12321 // Otherwise, fall back.
12322 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12326 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12328 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12329 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12332 // If we have AVX2 then we always want to lower with a blend because at v8 we
12333 // can fully permute the elements.
12334 if (Subtarget.hasAVX2())
12335 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12338 // Otherwise fall back on generic lowering.
12339 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12342 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12344 /// This routine is only called when we have AVX2 and thus a reasonable
12345 /// instruction set for v8i32 shuffling.
12346 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12347 const SmallBitVector &Zeroable,
12348 SDValue V1, SDValue V2,
12349 const X86Subtarget &Subtarget,
12350 SelectionDAG &DAG) {
12351 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12352 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12353 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12354 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12356 // Whenever we can lower this as a zext, that instruction is strictly faster
12357 // than any alternative. It also allows us to fold memory operands into the
12358 // shuffle in many cases.
12359 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12360 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12363 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12364 Zeroable, Subtarget, DAG))
12367 // Check for being able to broadcast a single element.
12368 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12369 Mask, Subtarget, DAG))
12372 // If the shuffle mask is repeated in each 128-bit lane, we can use more
12373 // efficient instructions that mirror the shuffles across the two 128-bit lanes.
12375 SmallVector<int, 4> RepeatedMask;
12376 bool Is128BitLaneRepeatedShuffle =
12377 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12378 if (Is128BitLaneRepeatedShuffle) {
12379 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12381 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12382 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12384 // Use dedicated unpack instructions for masks that match their pattern.
12386 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
12390 // Try to use shift instructions.
12391 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
12392 Zeroable, Subtarget, DAG))
12395 // If we have VLX support, we can use VALIGN.
12396 if (Subtarget.hasVLX())
12397 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
12398 Mask, Subtarget, DAG))
12401 // Try to use byte rotation instructions.
12402 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12403 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12406 // Try to create an in-lane repeating shuffle mask and then shuffle the
12407 // results into the target lanes.
12408 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12409 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12412 // If the shuffle pattern isn't repeated but this is a single-input shuffle,
12413 // directly generate a cross-lane VPERMD instruction.
12414 if (V2.isUndef()) {
12415 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12416 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
12419 // Assume that a single SHUFPS is faster than an alternative sequence of
12420 // multiple instructions (even if the CPU has a domain penalty).
12421 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12422 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
12423 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
12424 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
12425 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
12426 CastV1, CastV2, DAG);
12427 return DAG.getBitcast(MVT::v8i32, ShufPS);
12430 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12432 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12433 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12436 // Otherwise fall back on generic blend lowering.
12437 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
12441 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
12443 /// This routine is only called when we have AVX2 and thus a reasonable
12444 /// instruction set for v16i16 shuffling.
12445 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12446 const SmallBitVector &Zeroable,
12447 SDValue V1, SDValue V2,
12448 const X86Subtarget &Subtarget,
12449 SelectionDAG &DAG) {
12450 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12451 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12452 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12453 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
12455 // Whenever we can lower this as a zext, that instruction is strictly faster
12456 // than any alternative. It also allows us to fold memory operands into the
12457 // shuffle in many cases.
12458 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12459 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12462 // Check for being able to broadcast a single element.
12463 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
12464 Mask, Subtarget, DAG))
12467 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
12468 Zeroable, Subtarget, DAG))
12471 // Use dedicated unpack instructions for masks that match their pattern.
12473 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
12476 // Try to use shift instructions.
12477 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
12478 Zeroable, Subtarget, DAG))
12481 // Try to use byte rotation instructions.
12482 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12483 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12486 // Try to create an in-lane repeating shuffle mask and then shuffle the
12487 // results into the target lanes.
12488 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12489 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12492 if (V2.isUndef()) {
12493 // There are no generalized cross-lane shuffle operations available on i16 element types.
12495 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
12496 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
12499 SmallVector<int, 8> RepeatedMask;
12500 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12501 // As this is a single-input shuffle, the repeated mask should be
12502 // a strictly valid v8i16 mask that we can pass through to the v8i16
12503 // lowering to handle even the v16 case.
12504 return lowerV8I16GeneralSingleInputVectorShuffle(
12505 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
12509 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12510 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
12513 // AVX512BWVL can lower to VPERMW.
12514 if (Subtarget.hasBWI() && Subtarget.hasVLX())
12515 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
12517 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12519 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12520 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12523 // Otherwise fall back on generic lowering.
12524 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
12527 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
12529 /// This routine is only called when we have AVX2 and thus a reasonable
12530 /// instruction set for v32i8 shuffling.
12531 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12532 const SmallBitVector &Zeroable,
12533 SDValue V1, SDValue V2,
12534 const X86Subtarget &Subtarget,
12535 SelectionDAG &DAG) {
12536 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12537 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12538 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
12539 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
12541 // Whenever we can lower this as a zext, that instruction is strictly faster
12542 // than any alternative. It also allows us to fold memory operands into the
12543 // shuffle in many cases.
12544 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12545 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12548 // Check for being able to broadcast a single element.
12549 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
12550 Mask, Subtarget, DAG))
12553 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
12554 Zeroable, Subtarget, DAG))
12557 // Use dedicated unpack instructions for masks that match their pattern.
12559 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
12562 // Try to use shift instructions.
12563 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
12564 Zeroable, Subtarget, DAG))
12567 // Try to use byte rotation instructions.
12568 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12569 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12572 // Try to create an in-lane repeating shuffle mask and then shuffle the
12573 // results into the target lanes.
12574 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12575 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12578 // There are no generalized cross-lane shuffle operations available on i8 element types.
12580 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
12581 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
12584 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12585 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
12588 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12590 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12591 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12594 // Otherwise fall back on generic lowering.
12595 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
12598 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
12600 /// This routine either breaks down the specific type of a 256-bit x86 vector
12601 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
12602 /// together based on the available instructions.
12603 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12604 MVT VT, SDValue V1, SDValue V2,
12605 const SmallBitVector &Zeroable,
12606 const X86Subtarget &Subtarget,
12607 SelectionDAG &DAG) {
12608 // If we have a single input to the zero element, insert that into V1 if we
12609 // can do so cheaply.
12610 int NumElts = VT.getVectorNumElements();
12611 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
12613 if (NumV2Elements == 1 && Mask[0] >= NumElts)
12614 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
12615 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
12618 // Handle special cases where the lower or upper half is UNDEF.
12620 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
12623 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
12624 // can check for those subtargets here and avoid much of the subtarget
12625 // querying in the per-vector-type lowering routines. With AVX1 we have
12626 // essentially *zero* ability to manipulate a 256-bit vector with integer
12627 // types. Since we'll use floating point types there eventually, just
12628 // immediately cast everything to a float and operate entirely in that domain.
12629 if (VT.isInteger() && !Subtarget.hasAVX2()) {
12630 int ElementBits = VT.getScalarSizeInBits();
12631 if (ElementBits < 32) {
12632 // No floating-point type is available; if we can't use the bit operations
12633 // for masking/blending, then decompose into 128-bit vectors.
12635 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
12637 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12639 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12642 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
12643 VT.getVectorNumElements());
12644 V1 = DAG.getBitcast(FpVT, V1);
12645 V2 = DAG.getBitcast(FpVT, V2);
12646 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
12649 switch (VT.SimpleTy) {
12651 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12653 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12655 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12657 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12659 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12661 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12664 llvm_unreachable("Not a valid 256-bit x86 vector type!");
12668 /// \brief Try to lower a vector shuffle as a sequence of 128-bit shuffles.
12669 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
12670 ArrayRef<int> Mask, SDValue V1,
12671 SDValue V2, SelectionDAG &DAG) {
12672 assert(VT.getScalarSizeInBits() == 64 &&
12673 "Unexpected element type size for 128bit shuffle.");
12675 // Handling a 256-bit vector here would require VLX, and
12676 // lowerV2X128VectorShuffle() is most likely the better solution for that case.
12677 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
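// Widen the 64-bit mask elements into 128-bit chunks; this only succeeds when
// each pair of adjacent mask elements addresses a single 128-bit chunk of one
// of the inputs.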
12679 SmallVector<int, 4> WidenedMask;
12680 if (!canWidenShuffleElements(Mask, WidenedMask))
12683 // Check for patterns which can be matched with a single insert of a 256-bit subvector.
12685 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
12686 {0, 1, 2, 3, 0, 1, 2, 3});
12687 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
12688 {0, 1, 2, 3, 8, 9, 10, 11})) {
12689 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
12690 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12691 DAG.getIntPtrConstant(0, DL));
12692 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12693 OnlyUsesV1 ? V1 : V2,
12694 DAG.getIntPtrConstant(0, DL));
12695 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12698 assert(WidenedMask.size() == 4);
12700 // See if this is an insertion of the lower 128-bits of V2 into V1.
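// For example, the widened mask <0, 4, 2, 3> keeps chunks 0, 2 and 3 of V1 in
// place and drops the low 128 bits of V2 into chunk 1.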
12701 bool IsInsert = true;
12703 for (int i = 0; i < 4; ++i) {
12704 assert(WidenedMask[i] >= -1);
12705 if (WidenedMask[i] < 0)
12708 // Make sure all V1 subvectors are in place.
12709 if (WidenedMask[i] < 4) {
12710 if (WidenedMask[i] != i) {
12715 // Make sure we only have a single V2 index and that it is the lowest 128 bits.
12716 if (V2Index >= 0 || WidenedMask[i] != 4) {
12723 if (IsInsert && V2Index >= 0) {
12724 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
12725 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
12726 DAG.getIntPtrConstant(0, DL));
12727 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
12730 // Try to lower to vshuf64x2/vshuf32x4.
12731 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12732 unsigned PermMask = 0;
12733 // Ensure elements come from the same Op.
12734 for (int i = 0; i < 4; ++i) {
12735 assert(WidenedMask[i] >= -1);
12736 if (WidenedMask[i] < 0)
12739 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
12740 unsigned OpIndex = i / 2;
12741 if (Ops[OpIndex].isUndef())
12743 else if (Ops[OpIndex] != Op)
12746 // Convert the 128-bit shuffle mask selection values into 128-bit selection
12747 // bits defined by a vshuf64x2 instruction's immediate control byte.
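// For example, a widened mask of <0, 1, 5, 4> takes chunks 0 and 1 from
// Ops[0] and chunks 1 and 0 from Ops[1], producing the immediate
// 0b00010100 (0x14).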
12748 PermMask |= (WidenedMask[i] % 4) << (i * 2);
12751 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
12752 DAG.getConstant(PermMask, DL, MVT::i8));
12755 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
12756 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12757 SDValue V1, SDValue V2,
12758 const X86Subtarget &Subtarget,
12759 SelectionDAG &DAG) {
12760 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
12761 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
12762 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12764 if (V2.isUndef()) {
12765 // Use low duplicate instructions for masks that match their pattern.
12766 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
12767 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
12769 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
12770 // Non-half-crossing single input shuffles can be lowered with an
12771 // interleaved permutation.
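// Bit i of the immediate is set when result element i takes the high 64-bit
// element of its 128-bit lane; e.g. the mask <1, 0, 2, 3, 5, 5, 6, 6>
// produces the immediate 0b00111001.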
12772 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12773 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
12774 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
12775 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
12776 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
12777 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12780 SmallVector<int, 4> RepeatedMask;
12781 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
12782 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
12783 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12786 if (SDValue Shuf128 =
12787 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
12790 if (SDValue Unpck =
12791 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
12794 // Check if the blend happens to exactly fit the pattern of a SHUFPD.
12796 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
12799 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
12802 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
12803 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12804 SDValue V1, SDValue V2,
12805 const X86Subtarget &Subtarget,
12806 SelectionDAG &DAG) {
12807 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
12808 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
12809 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12811 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12812 // options to efficiently lower the shuffle.
12813 SmallVector<int, 4> RepeatedMask;
12814 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
12815 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12817 // Use even/odd duplicate instructions for masks that match their pattern.
12818 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12819 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
12820 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12821 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
12824 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
12825 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12827 // Use dedicated unpack instructions for masks that match their pattern.
12828 if (SDValue Unpck =
12829 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
12832 // Otherwise, fall back to a SHUFPS sequence.
12833 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
12836 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
12839 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
12840 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12841 const SmallBitVector &Zeroable,
12842 SDValue V1, SDValue V2,
12843 const X86Subtarget &Subtarget,
12844 SelectionDAG &DAG) {
12845 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
12846 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
12847 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12849 if (SDValue Shuf128 =
12850 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
12853 if (V2.isUndef()) {
12854 // When the shuffle mask is repeated in all four 128-bit lanes of the vector, we
12855 // can use lower-latency instructions that will operate on all four lanes.
12857 SmallVector<int, 2> Repeated128Mask;
12858 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
12859 SmallVector<int, 4> PSHUFDMask;
12860 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
12861 return DAG.getBitcast(
12863 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
12864 DAG.getBitcast(MVT::v16i32, V1),
12865 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12868 SmallVector<int, 4> Repeated256Mask;
12869 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
12870 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
12871 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
12874 // Try to use shift instructions.
12875 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
12876 Zeroable, Subtarget, DAG))
12879 // Try to use VALIGN.
12880 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
12881 Mask, Subtarget, DAG))
12884 // Try to use PALIGNR.
12885 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
12886 Mask, Subtarget, DAG))
12889 if (SDValue Unpck =
12890 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
12893 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
12896 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
12897 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12898 const SmallBitVector &Zeroable,
12899 SDValue V1, SDValue V2,
12900 const X86Subtarget &Subtarget,
12901 SelectionDAG &DAG) {
12902 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
12903 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
12904 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12906 // Whenever we can lower this as a zext, that instruction is strictly faster
12907 // than any alternative. It also allows us to fold memory operands into the
12908 // shuffle in many cases.
12909 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12910 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12913 // If the shuffle mask is repeated in each 128-bit lane, we can use more
12914 // efficient instructions that mirror the shuffles across the four 128-bit lanes.
12916 SmallVector<int, 4> RepeatedMask;
12917 bool Is128BitLaneRepeatedShuffle =
12918 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
12919 if (Is128BitLaneRepeatedShuffle) {
12920 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12922 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
12923 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12925 // Use dedicated unpack instructions for masks that match their pattern.
12927 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
12931 // Try to use shift instructions.
12932 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
12933 Zeroable, Subtarget, DAG))
12936 // Try to use VALIGN.
12937 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
12938 Mask, Subtarget, DAG))
12941 // Try to use byte rotation instructions.
12942 if (Subtarget.hasBWI())
12943 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12944 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
12947 // Assume that a single SHUFPS is faster than using a permv shuffle.
12948 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12949 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
12950 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
12951 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
12952 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
12953 CastV1, CastV2, DAG);
12954 return DAG.getBitcast(MVT::v16i32, ShufPS);
12957 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
12960 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
12961 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12962 const SmallBitVector &Zeroable,
12963 SDValue V1, SDValue V2,
12964 const X86Subtarget &Subtarget,
12965 SelectionDAG &DAG) {
12966 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
12967 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
12968 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
12969 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
12971 // Whenever we can lower this as a zext, that instruction is strictly faster
12972 // than any alternative. It also allows us to fold memory operands into the
12973 // shuffle in many cases.
12974 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12975 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12978 // Use dedicated unpack instructions for masks that match their pattern.
12980 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
12983 // Try to use shift instructions.
12984 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
12985 Zeroable, Subtarget, DAG))
12988 // Try to use byte rotation instructions.
12989 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12990 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
12993 if (V2.isUndef()) {
12994 SmallVector<int, 8> RepeatedMask;
12995 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
12996 // As this is a single-input shuffle, the repeated mask should be
12997 // a strictly valid v8i16 mask that we can pass through to the v8i16
12998 // lowering to handle even the v32 case.
12999 return lowerV8I16GeneralSingleInputVectorShuffle(
13000 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13004 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13007 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13008 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13009 const SmallBitVector &Zeroable,
13010 SDValue V1, SDValue V2,
13011 const X86Subtarget &Subtarget,
13012 SelectionDAG &DAG) {
13013 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13014 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13015 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13016 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13018 // Whenever we can lower this as a zext, that instruction is strictly faster
13019 // than any alternative. It also allows us to fold memory operands into the
13020 // shuffle in many cases.
13021 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13022 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13025 // Use dedicated unpack instructions for masks that match their pattern.
13027 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13030 // Try to use shift instructions.
13031 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13032 Zeroable, Subtarget, DAG))
13035 // Try to use byte rotation instructions.
13036 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13037 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13040 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13041 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13044 // VBMI can use VPERMV/VPERMV3 byte shuffles.
13045 if (Subtarget.hasVBMI())
13046 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13048 // Try to create an in-lane repeating shuffle mask and then shuffle the
13049 // results into the target lanes.
13050 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13051 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13054 // FIXME: Implement direct support for this type!
13055 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13058 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13060 /// This routine either breaks down the specific type of a 512-bit x86 vector
13061 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13062 /// together based on the available instructions.
13063 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13064 MVT VT, SDValue V1, SDValue V2,
13065 const SmallBitVector &Zeroable,
13066 const X86Subtarget &Subtarget,
13067 SelectionDAG &DAG) {
13068 assert(Subtarget.hasAVX512() &&
13069 "Cannot lower 512-bit vectors w/ basic ISA!");
13071 // If we have a single input to the zero element, insert that into V1 if we
13072 // can do so cheaply.
13073 int NumElts = Mask.size();
13074 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13076 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13077 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13078 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13081 // Check for being able to broadcast a single element.
13082 if (SDValue Broadcast =
13083 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13086 // Dispatch to each element type for lowering. If we don't have support for
13087 // specific element type shuffles at 512 bits, immediately split them and
13088 // lower them. Each lowering routine of a given type is allowed to assume that
13089 // the requisite ISA extensions for that element type are available.
13090 switch (VT.SimpleTy) {
13092 return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
13094 return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
13096 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13098 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13100 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13102 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13105 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13109 // Lower vXi1 vector shuffles.
13110 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13111 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13112 // vector, shuffle, and then truncate it back.
13113 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13114 MVT VT, SDValue V1, SDValue V2,
13115 const X86Subtarget &Subtarget,
13116 SelectionDAG &DAG) {
13117 assert(Subtarget.hasAVX512() &&
13118 "Cannot lower 512-bit vectors w/o basic ISA!");
13120 switch (VT.SimpleTy) {
13122 llvm_unreachable("Expected a vector of i1 elements");
13124 ExtVT = MVT::v2i64;
13127 ExtVT = MVT::v4i32;
13130 ExtVT = MVT::v8i64; // Take a 512-bit type; there are more shuffle options at 512 bits on KNL.
13133 ExtVT = MVT::v16i32;
13136 ExtVT = MVT::v32i16;
13139 ExtVT = MVT::v64i8;
13143 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13144 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13145 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13146 V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
13148 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13151 V2 = DAG.getUNDEF(ExtVT);
13152 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13153 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13154 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13155 V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
13157 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13159 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13160 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
13161 int NumElems = VT.getVectorNumElements();
13162 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13163 (Subtarget.hasDQI() && (NumElems < 32)))
13164 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13166 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13169 /// Helper function that returns true if the shuffle mask should be
13170 /// commuted to improve canonicalization.
13171 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13172 int NumElements = Mask.size();
13174 int NumV1Elements = 0, NumV2Elements = 0;
13178 else if (M < NumElements)
13183 // Commute the shuffle as needed such that more elements come from V1 than
13184 // V2. This allows us to match the shuffle pattern strictly on how many
13185 // elements come from V1 without handling the symmetric cases.
13186 if (NumV2Elements > NumV1Elements)
13189 assert(NumV1Elements > 0 && "No V1 indices");
13191 if (NumV2Elements == 0)
13194 // When the number of V1 and V2 elements is the same, try to minimize the
13195 // number of uses of V2 in the low half of the vector. When that is tied,
13196 // ensure that the sum of indices for V1 is equal to or lower than the sum of
13197 // indices for V2. When those are equal, try to ensure that the number of odd
13198 // indices for V1 is lower than the number of odd indices for V2.
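// For example, with four elements the mask <4, 1, 6, 3> uses two lanes from
// each input and both low-half counts are tied, but the V2 elements sit at
// positions 0 and 2 (sum 2) while the V1 elements sit at positions 1 and 3
// (sum 4), so the shuffle is commuted.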
13199 if (NumV1Elements == NumV2Elements) {
13200 int LowV1Elements = 0, LowV2Elements = 0;
13201 for (int M : Mask.slice(0, NumElements / 2))
13202 if (M >= NumElements)
13206 if (LowV2Elements > LowV1Elements)
13208 if (LowV2Elements == LowV1Elements) {
13209 int SumV1Indices = 0, SumV2Indices = 0;
13210 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13211 if (Mask[i] >= NumElements)
13213 else if (Mask[i] >= 0)
13215 if (SumV2Indices < SumV1Indices)
13217 if (SumV2Indices == SumV1Indices) {
13218 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13219 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13220 if (Mask[i] >= NumElements)
13221 NumV2OddIndices += i % 2;
13222 else if (Mask[i] >= 0)
13223 NumV1OddIndices += i % 2;
13224 if (NumV2OddIndices < NumV1OddIndices)
13233 /// \brief Top-level lowering for x86 vector shuffles.
13235 /// This handles decomposition, canonicalization, and lowering of all x86
13236 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13237 /// above in helper routines. The canonicalization attempts to widen shuffles
13238 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13239 /// s.t. only one of the two inputs needs to be tested, etc.
13240 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13241 SelectionDAG &DAG) {
13242 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13243 ArrayRef<int> Mask = SVOp->getMask();
13244 SDValue V1 = Op.getOperand(0);
13245 SDValue V2 = Op.getOperand(1);
13246 MVT VT = Op.getSimpleValueType();
13247 int NumElements = VT.getVectorNumElements();
13249 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13251 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13252 "Can't lower MMX shuffles");
13254 bool V1IsUndef = V1.isUndef();
13255 bool V2IsUndef = V2.isUndef();
13256 if (V1IsUndef && V2IsUndef)
13257 return DAG.getUNDEF(VT);
13259 // When we create a shuffle node we put the UNDEF node as the second operand,
13260 // but in some cases the first operand may be transformed to UNDEF.
13261 // In that case we should just commute the node.
13263 return DAG.getCommutedVectorShuffle(*SVOp);
13265 // Check for non-undef masks pointing at an undef vector and make the masks
13266 // undef as well. This makes it easier to match the shuffle based solely on the mask.
13270 if (M >= NumElements) {
13271 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13272 for (int &M : NewMask)
13273 if (M >= NumElements)
13275 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13278 // Check for illegal shuffle mask element index values.
13279 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13280 assert(llvm::all_of(Mask,
13281 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13282 "Out of bounds shuffle index");
13284 // We actually see shuffles that are entirely re-arrangements of a set of
13285 // zero inputs. This mostly happens while decomposing complex shuffles into
13286 // simple ones. Directly lower these as a buildvector of zeros.
13287 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13288 if (Zeroable.all())
13289 return getZeroVector(VT, Subtarget, DAG, DL);
13291 // Try to collapse shuffles into using a vector type with fewer elements but
13292 // wider element types. We cap this to not form integers or floating point
13293 // elements wider than 64 bits, but it might be interesting to form i128
13294 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
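// For example, a v4i32 shuffle with mask <0, 1, 6, 7> widens to a v2i64
// shuffle with mask <0, 3>.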
13295 SmallVector<int, 16> WidenedMask;
13296 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13297 canWidenShuffleElements(Mask, WidenedMask)) {
13298 MVT NewEltVT = VT.isFloatingPoint()
13299 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13300 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13301 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13302 // Make sure that the new vector type is legal. For example, v2f64 isn't legal on SSE1.
13304 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13305 V1 = DAG.getBitcast(NewVT, V1);
13306 V2 = DAG.getBitcast(NewVT, V2);
13307 return DAG.getBitcast(
13308 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13312 // Commute the shuffle if it will improve canonicalization.
13313 if (canonicalizeShuffleMaskWithCommute(Mask))
13314 return DAG.getCommutedVectorShuffle(*SVOp);
13316 // For each vector width, delegate to a specialized lowering routine.
13317 if (VT.is128BitVector())
13318 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13321 if (VT.is256BitVector())
13322 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13325 if (VT.is512BitVector())
13326 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13330 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13332 llvm_unreachable("Unimplemented!");
13335 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
13336 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
13337 const X86Subtarget &Subtarget,
13338 SelectionDAG &DAG) {
13339 SDValue Cond = Op.getOperand(0);
13340 SDValue LHS = Op.getOperand(1);
13341 SDValue RHS = Op.getOperand(2);
13343 MVT VT = Op.getSimpleValueType();
13345 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13347 auto *CondBV = cast<BuildVectorSDNode>(Cond);
13349 // Only non-legal VSELECTs reach this lowering; convert those into generic
13350 // shuffles and re-use the shuffle lowering path for blends.
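// A constant true lane selects element i of LHS (shuffle index i), a constant
// false lane selects element i of RHS (shuffle index i + Size), and a
// non-constant lane becomes undef (-1).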
13351 SmallVector<int, 32> Mask;
13352 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
13353 SDValue CondElt = CondBV->getOperand(i);
13355 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
13358 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
13361 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
13362 // A vselect where all conditions and data are constants can be optimized into
13363 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
13364 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
13365 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
13366 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
13369 // Try to lower this to a blend-style vector shuffle. This can handle all
13370 // constant condition cases.
13371 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
13374 // Variable blends are only legal from SSE4.1 onward.
13375 if (!Subtarget.hasSSE41())
13378 // Only some types will be legal on some subtargets. If we can emit a legal
13379 // VSELECT-matching blend, return Op, but if we need to expand, return a null value.
13381 switch (Op.getSimpleValueType().SimpleTy) {
13383 // Most of the vector types have blends past SSE4.1.
13387 // The byte blends for AVX vectors were introduced only in AVX2.
13388 if (Subtarget.hasAVX2())
13395 // AVX-512 BWI and VLX features support VSELECT with i16 elements.
13396 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13399 // FIXME: We should custom lower this by fixing the condition and using i8 blends.
13405 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
13406 MVT VT = Op.getSimpleValueType();
13409 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
13412 if (VT.getSizeInBits() == 8) {
13413 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
13414 Op.getOperand(0), Op.getOperand(1));
13415 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13416 DAG.getValueType(VT));
13417 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13420 if (VT == MVT::f32) {
13421 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13422 // the result back to an FR32 register. It's only worth matching if the
13423 // result has a single use which is a store or a bitcast to i32. And in
13424 // the case of a store, it's not worth it if the index is a constant 0,
13425 // because a MOVSSmr can be used instead, which is smaller and faster.
13426 if (!Op.hasOneUse())
13428 SDNode *User = *Op.getNode()->use_begin();
13429 if ((User->getOpcode() != ISD::STORE ||
13430 isNullConstant(Op.getOperand(1))) &&
13431 (User->getOpcode() != ISD::BITCAST ||
13432 User->getValueType(0) != MVT::i32))
13434 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13435 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
13437 return DAG.getBitcast(MVT::f32, Extract);
13440 if (VT == MVT::i32 || VT == MVT::i64) {
13441 // ExtractPS/pextrq work with a constant index.
13442 if (isa<ConstantSDNode>(Op.getOperand(1)))
13449 /// Extract one bit from mask vector, like v16i1 or v8i1.
13450 /// AVX-512 feature.
13452 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13453 SDValue Vec = Op.getOperand(0);
13455 MVT VecVT = Vec.getSimpleValueType();
13456 SDValue Idx = Op.getOperand(1);
13457 MVT EltVT = Op.getSimpleValueType();
13459 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13460 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
13461 "Unexpected vector type in ExtractBitFromMaskVector");
13463 // A variable index can't be handled in mask registers,
13464 // so extend the vector to VR512.
13465 if (!isa<ConstantSDNode>(Idx)) {
13466 MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13467 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
13468 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13469 ExtVT.getVectorElementType(), Ext, Idx);
13470 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13473 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13474 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
13475 (VecVT.getVectorNumElements() < 8)) {
13476 // Use the kshiftlw/kshiftrw instructions.
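// The idea is to shift the requested bit up to the MSB with VSHLI, then shift
// it back down to bit 0 with VSRLI and extract element 0 of the mask vector.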
13477 VecVT = MVT::v16i1;
13478 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
13479 DAG.getUNDEF(VecVT),
13481 DAG.getIntPtrConstant(0, dl));
13483 unsigned MaxShift = VecVT.getVectorNumElements() - 1;
13484 if (MaxShift - IdxVal)
13485 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13486 DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
13487 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13488 DAG.getConstant(MaxShift, dl, MVT::i8));
13489 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13490 DAG.getIntPtrConstant(0, dl));
13494 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13495 SelectionDAG &DAG) const {
13497 SDValue Vec = Op.getOperand(0);
13498 MVT VecVT = Vec.getSimpleValueType();
13499 SDValue Idx = Op.getOperand(1);
13501 if (Op.getSimpleValueType() == MVT::i1)
13502 return ExtractBitFromMaskVector(Op, DAG);
13504 if (!isa<ConstantSDNode>(Idx)) {
13505 if (VecVT.is512BitVector() ||
13506 (VecVT.is256BitVector() && Subtarget.hasInt256() &&
13507 VecVT.getScalarSizeInBits() == 32)) {
13510 MVT::getIntegerVT(VecVT.getScalarSizeInBits());
13511 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
13512 MaskEltVT.getSizeInBits());
13514 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
13515 auto PtrVT = getPointerTy(DAG.getDataLayout());
13516 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
13517 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
13518 DAG.getConstant(0, dl, PtrVT));
13519 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
13520 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
13521 DAG.getConstant(0, dl, PtrVT));
13526 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13528 // If this is a 256-bit vector result, first extract the 128-bit vector and
13529 // then extract the element from the 128-bit vector.
13530 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13531 // Get the 128-bit vector.
13532 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
13533 MVT EltVT = VecVT.getVectorElementType();
13535 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
13536 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
13538 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
13539 // this can be done with a mask.
13540 IdxVal &= ElemsPerChunk - 1;
13541 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
13542 DAG.getConstant(IdxVal, dl, MVT::i32));
13545 assert(VecVT.is128BitVector() && "Unexpected vector length");
13547 MVT VT = Op.getSimpleValueType();
13549 if (VT.getSizeInBits() == 16) {
13550 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
13551 // we're going to zero extend the register or fold the store (SSE41 only).
13552 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
13553 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
13554 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13555 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13556 DAG.getBitcast(MVT::v4i32, Vec), Idx));
13558 // Transform it so it matches pextrw, which produces a 32-bit result.
13559 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
13560 Op.getOperand(0), Op.getOperand(1));
13561 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13562 DAG.getValueType(VT));
13563 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13566 if (Subtarget.hasSSE41())
13567 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
13570 // TODO: handle v16i8.
13572 if (VT.getSizeInBits() == 32) {
13576 // SHUFPS the element to the lowest double word, then movss.
13577 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
13578 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
13579 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13580 DAG.getIntPtrConstant(0, dl));
13583 if (VT.getSizeInBits() == 64) {
13584 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
13585 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
13586 // to match extract_elt for f64.
13590 // UNPCKHPD the element to the lowest double word, then movsd.
13591 // Note that if the lower 64 bits of the result of the UNPCKHPD are then stored
13592 // to an f64mem, the whole operation is folded into a single MOVHPDmr.
13593 int Mask[2] = { 1, -1 };
13594 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
13595 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13596 DAG.getIntPtrConstant(0, dl));
13602 /// Insert one bit to mask vector, like v16i1 or v8i1.
13603 /// AVX-512 feature.
13605 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13607 SDValue Vec = Op.getOperand(0);
13608 SDValue Elt = Op.getOperand(1);
13609 SDValue Idx = Op.getOperand(2);
13610 MVT VecVT = Vec.getSimpleValueType();
13612 if (!isa<ConstantSDNode>(Idx)) {
13613 // Non-constant index: extend the source and destination,
13614 // insert the element, and then truncate the result.
13615 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13616 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
13617 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13618 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13619 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13620 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13623 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13624 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13625 unsigned NumElems = VecVT.getVectorNumElements();
13627 if(Vec.isUndef()) {
13629 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13630 DAG.getConstant(IdxVal, dl, MVT::i8));
13634 // Insertion of one bit into the first or last position
13635 // can be done with two SHIFTs + OR.
13636 if (IdxVal == 0) {
13637 // EltInVec is already at the correct index and the other bits are 0.
13638 // Clear the first bit in the source vector.
13639 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13640 DAG.getConstant(1, dl, MVT::i8));
13641 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13642 DAG.getConstant(1, dl, MVT::i8));
13644 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13646 if (IdxVal == NumElems -1) {
13647 // Move the bit to the last position inside the vector.
13648 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13649 DAG.getConstant(IdxVal, dl, MVT::i8));
13650 // Clear the last bit in the source vector.
13651 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13652 DAG.getConstant(1, dl, MVT::i8));
13653 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13654 DAG.getConstant(1, dl, MVT::i8));
13656 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13659 // Use shuffle to insert element.
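// For example, with 8 elements and IdxVal == 2 the mask becomes
// <0, 1, 8, 3, 4, 5, 6, 7>, taking element 2 from EltInVec (whose element 0
// holds the value to insert) and every other element from Vec.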
13660 SmallVector<int, 64> MaskVec(NumElems);
13661 for (unsigned i = 0; i != NumElems; ++i)
13662 MaskVec[i] = (i == IdxVal) ? NumElems : i;
13664 return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
13667 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13668 SelectionDAG &DAG) const {
13669 MVT VT = Op.getSimpleValueType();
13670 MVT EltVT = VT.getVectorElementType();
13671 unsigned NumElts = VT.getVectorNumElements();
13673 if (EltVT == MVT::i1)
13674 return InsertBitToMaskVector(Op, DAG);
13677 SDValue N0 = Op.getOperand(0);
13678 SDValue N1 = Op.getOperand(1);
13679 SDValue N2 = Op.getOperand(2);
13680 if (!isa<ConstantSDNode>(N2))
13682 auto *N2C = cast<ConstantSDNode>(N2);
13683 unsigned IdxVal = N2C->getZExtValue();
13685 // If we are clearing out an element, we can do this more efficiently with a
13686 // blend shuffle than with a costly integer insertion.
13687 // TODO: would other rematerializable values (e.g. allbits) benefit as well?
13688 // TODO: pre-SSE41 targets will tend to use bit masking - this could still
13689 // be beneficial if we are inserting several zeros and can combine the masks.
13690 if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
13691 SmallVector<int, 8> ClearMask;
13692 for (unsigned i = 0; i != NumElts; ++i)
13693 ClearMask.push_back(i == IdxVal ? i + NumElts : i);
13694 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
13695 return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
13698 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13699 // into that, and then insert the subvector back into the result.
13700 if (VT.is256BitVector() || VT.is512BitVector()) {
13701 // With a 256-bit vector, we can insert into the zero element efficiently
13702 // using a blend if we have AVX or AVX2 and the right data type.
13703 if (VT.is256BitVector() && IdxVal == 0) {
13704 // TODO: It is worthwhile to cast integer to floating point and back
13705 // and incur a domain crossing penalty if that's what we'll end up
13706 // doing anyway after extracting to a 128-bit vector.
13707 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13708 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
13709 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
13710 N2 = DAG.getIntPtrConstant(1, dl);
13711 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
13715 // Get the desired 128-bit vector chunk.
13716 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
13718 // Insert the element into the desired chunk.
13719 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13720 assert(isPowerOf2_32(NumEltsIn128));
13721 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
13722 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
13724 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13725 DAG.getConstant(IdxIn128, dl, MVT::i32));
13727 // Insert the changed part back into the bigger vector
13728 return insert128BitVector(N0, V, IdxVal, DAG, dl);
13730 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13732 if (Subtarget.hasSSE41()) {
13733 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13735 if (VT == MVT::v8i16) {
13736 Opc = X86ISD::PINSRW;
13738 assert(VT == MVT::v16i8);
13739 Opc = X86ISD::PINSRB;
13742 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second argument.
13744 if (N1.getValueType() != MVT::i32)
13745 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13746 if (N2.getValueType() != MVT::i32)
13747 N2 = DAG.getIntPtrConstant(IdxVal, dl);
13748 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13751 if (EltVT == MVT::f32) {
13752 // Bits [7:6] of the constant are the source select. This will always be
13753 // zero here. The DAG Combiner may combine an extract_elt index into
13754 // these bits. For example (insert (extract, 3), 2) could be matched by
13755 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
13756 // Bits [5:4] of the constant are the destination select. This is the
13757 // value of the incoming immediate.
13758 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
13759 // combine either bitwise AND or insert of float 0.0 to set these bits.
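// For a plain insertion the source select and zero mask are both zero, so the
// immediate below is simply IdxVal << 4.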
13761 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
13762 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
13763 // If this is an insertion of 32-bits into the low 32-bits of
13764 // a vector, we prefer to generate a blend with immediate rather
13765 // than an insertps. Blends are simpler operations in hardware and so
13766 // will always have equal or better performance than insertps.
13767 // But if optimizing for size and there's a load folding opportunity,
13768 // generate insertps because blendps does not have a 32-bit memory operand.
13770 N2 = DAG.getIntPtrConstant(1, dl);
13771 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13772 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
13774 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
13775 // Create this as a scalar-to-vector.
13776 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13777 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13780 if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13781 // PINSR* works with constant index.
13786 if (EltVT == MVT::i8)
13789 if (EltVT.getSizeInBits() == 16) {
13790 // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
13791 // as its second argument.
13792 if (N1.getValueType() != MVT::i32)
13793 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13794 if (N2.getValueType() != MVT::i32)
13795 N2 = DAG.getIntPtrConstant(IdxVal, dl);
13796 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13801 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13803 MVT OpVT = Op.getSimpleValueType();
13805 // If this is a 256-bit vector result, first insert into a 128-bit
13806 // vector and then insert into the 256-bit vector.
13807 if (!OpVT.is128BitVector()) {
13808 // Insert into a 128-bit vector.
13809 unsigned SizeFactor = OpVT.getSizeInBits()/128;
13810 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13811 OpVT.getVectorNumElements() / SizeFactor);
13813 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13815 // Insert the 128-bit vector.
13816 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13819 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13820 assert(OpVT.is128BitVector() && "Expected an SSE type!");
13821 return DAG.getBitcast(
13822 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
13825 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
13826 // a simple subregister reference or explicit instructions to grab
13827 // upper bits of a vector.
13828 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
13829 SelectionDAG &DAG) {
13830 assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
13833 SDValue In = Op.getOperand(0);
13834 SDValue Idx = Op.getOperand(1);
13835 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13836 MVT ResVT = Op.getSimpleValueType();
13838 assert((In.getSimpleValueType().is256BitVector() ||
13839 In.getSimpleValueType().is512BitVector()) &&
13840 "Can only extract from 256-bit or 512-bit vectors");
13842 if (ResVT.is128BitVector())
13843 return extract128BitVector(In, IdxVal, DAG, dl);
13844 if (ResVT.is256BitVector())
13845 return extract256BitVector(In, IdxVal, DAG, dl);
13847 llvm_unreachable("Unimplemented!");
13850 static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
13851 for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
13852 if (llvm::all_of(ValidUsers,
13853 [&I](SDValue V) { return V.getNode() != *I; }))
13858 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
13859 // simple superregister reference or explicit instructions to insert
13860 // the upper bits of a vector.
13861 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
13862 SelectionDAG &DAG) {
13863 assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");
13866 SDValue Vec = Op.getOperand(0);
13867 SDValue SubVec = Op.getOperand(1);
13868 SDValue Idx = Op.getOperand(2);
13870 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13871 MVT OpVT = Op.getSimpleValueType();
13872 MVT SubVecVT = SubVec.getSimpleValueType();
13874 if (OpVT.getVectorElementType() == MVT::i1)
13875 return insert1BitVector(Op, DAG, Subtarget);
13877 assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13878 "Can only insert into 256-bit or 512-bit vectors");
13880 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte load:
13882 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
13883 // (load16 addr + 16), Elts/2)
13886 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
13887 // (load32 addr + 32), Elts/2)
13889 // or a 16-byte or 32-byte broadcast:
13890 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
13891 // (load16 addr), Elts/2)
13892 // --> X86SubVBroadcast(load16 addr)
13894 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
13895 // (load32 addr), Elts/2)
13896 // --> X86SubVBroadcast(load32 addr)
13897 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
13898 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
13899 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
13900 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
13901 if (Idx2 && Idx2->getZExtValue() == 0) {
13902 SDValue SubVec2 = Vec.getOperand(1);
13903 // If needed, look through bitcasts to get to the load.
13904 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
13906 unsigned Alignment = FirstLd->getAlignment();
13907 unsigned AS = FirstLd->getAddressSpace();
13908 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
13909 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
13910 OpVT, AS, Alignment, &Fast) && Fast) {
13911 SDValue Ops[] = {SubVec2, SubVec};
13912 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
13916 // If lower/upper loads are the same and the only users of the load, then
13917 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
13918 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
13919 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
13920 areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
13921 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
13924 // If this is a subv_broadcast inserted into both halves, use a larger subv_broadcast.
13926 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
13927 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
13928 SubVec.getOperand(0));
13933 if (SubVecVT.is128BitVector())
13934 return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13936 if (SubVecVT.is256BitVector())
13937 return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13939 llvm_unreachable("Unimplemented!");
13942 // Returns the appropriate wrapper opcode for a global reference.
13943 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
13944 // References to absolute symbols are never PC-relative.
13945 if (GV && GV->isAbsoluteSymbolRef())
13946 return X86ISD::Wrapper;
13948 CodeModel::Model M = getTargetMachine().getCodeModel();
13949 if (Subtarget.isPICStyleRIPRel() &&
13950 (M == CodeModel::Small || M == CodeModel::Kernel))
13951 return X86ISD::WrapperRIP;
13953 return X86ISD::Wrapper;
13956 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13957 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
13958 // one of the above mentioned nodes. It has to be wrapped because otherwise
13959 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13960 // be used to form addressing modes. These wrapped nodes will be selected into MOV32ri.
13963 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13964 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13966 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13967 // global base reg.
13968 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
13970 auto PtrVT = getPointerTy(DAG.getDataLayout());
13971 SDValue Result = DAG.getTargetConstantPool(
13972 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
13974 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
13975 // With PIC, the address is actually $g + Offset.
13978 DAG.getNode(ISD::ADD, DL, PtrVT,
13979 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
13985 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13986 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13988 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13989 // global base reg.
13990 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
13992 auto PtrVT = getPointerTy(DAG.getDataLayout());
13993 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
13995 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
13997 // With PIC, the address is actually $g + Offset.
14000 DAG.getNode(ISD::ADD, DL, PtrVT,
14001 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14007 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
14008 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
14010 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14011 // global base reg.
14012 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
14013 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
14015 auto PtrVT = getPointerTy(DAG.getDataLayout());
14016 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
14019 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14021 // With PIC, the address is actually $g + Offset.
14022 if (isPositionIndependent() && !Subtarget.is64Bit()) {
14024 DAG.getNode(ISD::ADD, DL, PtrVT,
14025 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14028 // For symbols that require a load from a stub to get the address, emit the load.
14030 if (isGlobalStubReference(OpFlag))
14031 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
14032 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14038 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
14039 // Create the TargetBlockAddress node.
14040 unsigned char OpFlags =
14041 Subtarget.classifyBlockAddressReference();
14042 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
14043 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
14045 auto PtrVT = getPointerTy(DAG.getDataLayout());
14046 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
14047 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
14049 // With PIC, the address is actually $g + Offset.
14050 if (isGlobalRelativeToPICBase(OpFlags)) {
14051 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14052 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14058 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
14059 const SDLoc &dl, int64_t Offset,
14060 SelectionDAG &DAG) const {
14061 // Create the TargetGlobalAddress node, folding in the constant
14062 // offset if it is legal.
14063 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
14064 CodeModel::Model M = DAG.getTarget().getCodeModel();
14065 auto PtrVT = getPointerTy(DAG.getDataLayout());
14067 if (OpFlags == X86II::MO_NO_FLAG &&
14068 X86::isOffsetSuitableForCodeModel(Offset, M)) {
14069 // A direct static reference to a global.
14070 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
14073 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
14076 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
14078 // With PIC, the address is actually $g + Offset.
14079 if (isGlobalRelativeToPICBase(OpFlags)) {
14080 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14081 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14084 // For globals that require a load from a stub to get the address, emit the load.
14086 if (isGlobalStubReference(OpFlags))
14087 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
14088 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14090 // If there was a non-zero offset that we didn't fold, create an explicit
14091 // addition for it.
14093 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
14094 DAG.getConstant(Offset, dl, PtrVT));
14100 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
14101 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
14102 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
14103 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
14107 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
14108 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
14109 unsigned char OperandFlags, bool LocalDynamic = false) {
14110 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14111 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14113 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14114 GA->getValueType(0),
14118 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
14122 SDValue Ops[] = { Chain, TGA, *InFlag };
14123 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14125 SDValue Ops[] = { Chain, TGA };
14126 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14129 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
14130 MFI.setAdjustsStack(true);
14131 MFI.setHasCalls(true);
14133 SDValue Flag = Chain.getValue(1);
14134 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
14137 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
14139 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14142 SDLoc dl(GA); // ? function entry point might be better
14143 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14144 DAG.getNode(X86ISD::GlobalBaseReg,
14145 SDLoc(), PtrVT), InFlag);
14146 InFlag = Chain.getValue(1);
14148 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
14151 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
14153 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14155 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
14156 X86::RAX, X86II::MO_TLSGD);
14159 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
14165 // Get the start address of the TLS block for this module.
14166 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14167 .getInfo<X86MachineFunctionInfo>();
14168 MFI->incNumLocalDynamicTLSAccesses();
14172 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
14173 X86II::MO_TLSLD, /*LocalDynamic=*/true);
14176 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14177 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
14178 InFlag = Chain.getValue(1);
14179 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
14180 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
14183 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations of the TLS base address.
14187 unsigned char OperandFlags = X86II::MO_DTPOFF;
14188 unsigned WrapperKind = X86ISD::Wrapper;
14189 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14190 GA->getValueType(0),
14191 GA->getOffset(), OperandFlags);
14192 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14194 // Add x@dtpoff with the base.
14195 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
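// For reference, the 64-bit local-dynamic path above produces roughly the
// following sequence (exact assembly depends on the relocations and linker):
//   leaq   x@tlsld(%rip), %rdi
//   callq  __tls_get_addr@PLT    # TLS block base returned in %rax
//   leaq   x@dtpoff(%rax), %rax  # add the variable's offset within the block
// The CleanupLocalDynamicTLSPass mentioned above lets several such accesses
// share one __tls_get_addr call.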
14198 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
14199 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14200 const EVT PtrVT, TLSModel::Model model,
14201 bool is64Bit, bool isPIC) {
14204 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
14205 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14206 is64Bit ? 257 : 256));
14208 SDValue ThreadPointer =
14209 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14210 MachinePointerInfo(Ptr));
14212 unsigned char OperandFlags = 0;
14213 // Most TLS accesses are not RIP relative, even on x86-64. One exception is the
14213 // initial-exec model on 64-bit, which loads the offset RIP-relatively from the GOT.
14215 unsigned WrapperKind = X86ISD::Wrapper;
14216 if (model == TLSModel::LocalExec) {
14217 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
14218 } else if (model == TLSModel::InitialExec) {
14220 OperandFlags = X86II::MO_GOTTPOFF;
14221 WrapperKind = X86ISD::WrapperRIP;
14223 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
14226 llvm_unreachable("Unexpected model");
14229 // emit "addl x@ntpoff,%eax" (local exec)
14230 // or "addl x@indntpoff,%eax" (initial exec)
14231 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
14233 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
14234 GA->getOffset(), OperandFlags);
14235 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14237 if (model == TLSModel::InitialExec) {
14238 if (isPIC && !is64Bit) {
14239 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
14240 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14244 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
14245 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14248 // The address of the thread-local variable is the sum of the thread
14249 // pointer and the offset of the variable.
14250 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
14254 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14256 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14258 if (DAG.getTarget().Options.EmulatedTLS)
14259 return LowerToTLSEmulatedModel(GA, DAG);
14261 const GlobalValue *GV = GA->getGlobal();
14262 auto PtrVT = getPointerTy(DAG.getDataLayout());
14263 bool PositionIndependent = isPositionIndependent();
14265 if (Subtarget.isTargetELF()) {
14266 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
14268 case TLSModel::GeneralDynamic:
14269 if (Subtarget.is64Bit())
14270 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14271 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14272 case TLSModel::LocalDynamic:
14273 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14274 Subtarget.is64Bit());
14275 case TLSModel::InitialExec:
14276 case TLSModel::LocalExec:
14277 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
14278 PositionIndependent);
14280 llvm_unreachable("Unknown TLS model.");
14283 if (Subtarget.isTargetDarwin()) {
14284 // Darwin only has one model of TLS. Lower to that.
14285 unsigned char OpFlag = 0;
14286 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14287 X86ISD::WrapperRIP : X86ISD::Wrapper;
14289 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14290 // global base reg.
14291 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
14293 OpFlag = X86II::MO_TLVP_PIC_BASE;
14295 OpFlag = X86II::MO_TLVP;
14297 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
14298 GA->getValueType(0),
14299 GA->getOffset(), OpFlag);
14300 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
14302 // With PIC32, the address is actually $g + Offset.
14304 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
14305 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14308 // Lowering the machine ISD node will make sure everything is in the right location.
14310 SDValue Chain = DAG.getEntryNode();
14311 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14312 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
14313 SDValue Args[] = { Chain, Offset };
14314 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
14315 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
14316 DAG.getIntPtrConstant(0, DL, true),
14317 Chain.getValue(1), DL);
14319 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
14320 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14321 MFI.setAdjustsStack(true);
14323 // And our return value (the TLS address) is in the standard call return value register (EAX/RAX).
14325 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
14326 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
14329 if (Subtarget.isTargetKnownWindowsMSVC() ||
14330 Subtarget.isTargetWindowsItanium() ||
14331 Subtarget.isTargetWindowsGNU()) {
14332 // Just use the implicit TLS architecture
14333 // Need to generate something similar to:
14334 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
14336 // mov ecx, dword [rel _tls_index]; Load index (from C runtime)
14337 // mov rcx, qword [rdx+rcx*8]
14338 // mov eax, .tls$:tlsvar
14339 // [rax+rcx] contains the address
14340 // Windows 64bit: gs:0x58
14341 // Windows 32bit: fs:__tls_array
14344 SDValue Chain = DAG.getEntryNode();
14346 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
14347 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
14348 // use its literal value of 0x2C.
14349 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
14350 ? Type::getInt8PtrTy(*DAG.getContext(),
14352 : Type::getInt32PtrTy(*DAG.getContext(),
14355 SDValue TlsArray = Subtarget.is64Bit()
14356 ? DAG.getIntPtrConstant(0x58, dl)
14357 : (Subtarget.isTargetWindowsGNU()
14358 ? DAG.getIntPtrConstant(0x2C, dl)
14359 : DAG.getExternalSymbol("_tls_array", PtrVT));
14361 SDValue ThreadPointer =
14362 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
14365 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
14366 res = ThreadPointer;
14368 // Load the _tls_index variable
14369 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
14370 if (Subtarget.is64Bit())
14371 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
14372 MachinePointerInfo(), MVT::i32);
14374 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
14376 auto &DL = DAG.getDataLayout();
14378 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
14379 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
14381 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
14384 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
14386 // Get the offset of the start of the .tls section
14387 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14388 GA->getValueType(0),
14389 GA->getOffset(), X86II::MO_SECREL);
14390 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
14392 // The address of the thread-local variable is the sum of the thread
14393 // pointer and the offset of the variable.
14394 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
14397 llvm_unreachable("TLS not implemented for this target.");
14400 /// Lower SRA_PARTS and friends, which return two i32 values
14401 /// and take a 2 x i32 value to shift plus a shift amount.
14402 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
14403 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
14404 MVT VT = Op.getSimpleValueType();
14405 unsigned VTBits = VT.getSizeInBits();
14407 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
14408 SDValue ShOpLo = Op.getOperand(0);
14409 SDValue ShOpHi = Op.getOperand(1);
14410 SDValue ShAmt = Op.getOperand(2);
14411 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
14412 // generic ISD nodes don't. Insert an AND to be safe; it's optimized away later.
14414 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14415 DAG.getConstant(VTBits - 1, dl, MVT::i8));
14416 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
14417 DAG.getConstant(VTBits - 1, dl, MVT::i8))
14418 : DAG.getConstant(0, dl, VT);
14420 SDValue Tmp2, Tmp3;
14421 if (Op.getOpcode() == ISD::SHL_PARTS) {
14422 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
14423 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
14425 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
14426 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
14429 // If the shift amount is larger than or equal to the width of a part, we can't
14430 // rely on the results of shld/shrd. Insert a test and select the appropriate
14431 // values for large shift amounts.
14432 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14433 DAG.getConstant(VTBits, dl, MVT::i8));
14434 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
14435 AndNode, DAG.getConstant(0, dl, MVT::i8));
14438 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
14439 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
14440 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
14442 if (Op.getOpcode() == ISD::SHL_PARTS) {
14443 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14444 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14446 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14447 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14450 SDValue Ops[2] = { Lo, Hi };
14451 return DAG.getMergeValues(Ops, dl);
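// Illustrative summary of the selection above for SHL_PARTS with parts
// (Lo, Hi) and amount n (a 64-bit value split into 32-bit parts):
//   if ((n & 32) == 0) { Hi = shld(Hi, Lo, n); Lo = Lo << n; }
//   else               { Hi = Lo << (n & 31);  Lo = 0;       }
// e.g. n == 40 yields Hi = Lo << 8 and Lo = 0. SRL/SRA_PARTS are analogous
// with the roles of Lo/Hi swapped and Tmp1 supplying the zero/sign fill.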
14454 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
14455 SelectionDAG &DAG) const {
14456 SDValue Src = Op.getOperand(0);
14457 MVT SrcVT = Src.getSimpleValueType();
14458 MVT VT = Op.getSimpleValueType();
14461 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14462 if (SrcVT.isVector()) {
14463 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
14464 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
14465 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
14466 DAG.getUNDEF(SrcVT)));
14468 if (SrcVT.getVectorElementType() == MVT::i1) {
14469 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
14470 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14471 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
14472 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14473 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14474 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
14479 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
14480 "Unknown SINT_TO_FP to lower!");
14482 // These are really Legal; return the operand so the caller accepts it as Legal.
14484 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
14486 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14487 Subtarget.is64Bit()) {
14491 SDValue ValueToStore = Op.getOperand(0);
14492 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14493 !Subtarget.is64Bit())
14494 // Bitcasting to f64 here allows us to do a single 64-bit store from
14495 // an SSE register, avoiding the store forwarding penalty that would come
14496 // with two 32-bit stores.
14497 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
14499 unsigned Size = SrcVT.getSizeInBits()/8;
14500 MachineFunction &MF = DAG.getMachineFunction();
14501 auto PtrVT = getPointerTy(MF.getDataLayout());
14502 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
14503 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14504 SDValue Chain = DAG.getStore(
14505 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
14506 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
14507 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
14510 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
14512 SelectionDAG &DAG) const {
14516 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
14518 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
14520 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
14522 unsigned ByteSize = SrcVT.getSizeInBits()/8;
14524 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14525 MachineMemOperand *MMO;
14527 int SSFI = FI->getIndex();
14528 MMO = DAG.getMachineFunction().getMachineMemOperand(
14529 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14530 MachineMemOperand::MOLoad, ByteSize, ByteSize);
14532 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14533 StackSlot = StackSlot.getOperand(1);
14535 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14536 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14538 Tys, Ops, SrcVT, MMO);
14541 Chain = Result.getValue(1);
14542 SDValue InFlag = Result.getValue(2);
14544 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14545 // shouldn't be necessary except that RFP cannot be live across
14546 // multiple blocks. When stackifier is fixed, they can be uncoupled.
14547 MachineFunction &MF = DAG.getMachineFunction();
14548 unsigned SSFISize = Op.getValueSizeInBits()/8;
14549 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
14550 auto PtrVT = getPointerTy(MF.getDataLayout());
14551 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14552 Tys = DAG.getVTList(MVT::Other);
14554 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14556 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
14557 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14558 MachineMemOperand::MOStore, SSFISize, SSFISize);
14560 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14561 Ops, Op.getValueType(), MMO);
14562 Result = DAG.getLoad(
14563 Op.getValueType(), DL, Chain, StackSlot,
14564 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
14570 /// 64-bit unsigned integer to double expansion.
14571 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14572 SelectionDAG &DAG) const {
14573 // This algorithm is not obvious. Here is what we're trying to output:
14576 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
14577 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
14579 haddpd %xmm0, %xmm0
14581 pshufd $0x4e, %xmm0, %xmm1
14587 LLVMContext *Context = DAG.getContext();
14589 // Build some magic constants.
14590 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
14591 Constant *C0 = ConstantDataVector::get(*Context, CV0);
14592 auto PtrVT = getPointerTy(DAG.getDataLayout());
14593 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
14595 SmallVector<Constant*,2> CV1;
14597 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
14598 APInt(64, 0x4330000000000000ULL))));
14600 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
14601 APInt(64, 0x4530000000000000ULL))));
14602 Constant *C1 = ConstantVector::get(CV1);
14603 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
14605 // Load the 64-bit value into an XMM register.
14606 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
14609 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
14610 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14611 /* Alignment = */ 16);
14613 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
14616 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
14617 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14618 /* Alignment = */ 16);
14619 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
14620 // TODO: Are there any fast-math-flags to propagate here?
14621 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
14624 if (Subtarget.hasSSE3()) {
14625 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
14626 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
14628 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
14629 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
14630 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
14631 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
14634 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
14635 DAG.getIntPtrConstant(0, dl));
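// Numeric sketch of the magic-constant trick above: for an input split into
// 32-bit halves (lo, hi), the unpack builds the doubles 2^52 + lo and
// 2^84 + hi*2^32 bit-for-bit; subtracting {2^52, 2^52 * 2^32} leaves exactly
// lo and hi*2^32 as doubles, and the final add combines them. For example
// 0x100000002 becomes 2.0 + 4294967296.0 = 4294967298.0.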
14638 /// 32-bit unsigned integer to float expansion.
14639 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
14640 SelectionDAG &DAG) const {
14642 // FP constant to bias correct the final result.
14643 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
14646 // Load the 32-bit value into an XMM register.
14647 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
14650 // Zero out the upper parts of the register.
14651 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
14653 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14654 DAG.getBitcast(MVT::v2f64, Load),
14655 DAG.getIntPtrConstant(0, dl));
14657 // Or the load with the bias.
14658 SDValue Or = DAG.getNode(
14659 ISD::OR, dl, MVT::v2i64,
14660 DAG.getBitcast(MVT::v2i64,
14661 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
14662 DAG.getBitcast(MVT::v2i64,
14663 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
14665 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14666 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
14668 // Subtract the bias.
14669 // TODO: Are there any fast-math-flags to propagate here?
14670 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
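// Illustration: OR-ing the 32-bit input into the low word of the bit pattern
// of 2^52 yields the double 2^52 + x exactly (x < 2^32 fits in the 52-bit
// mantissa), so Sub == x with no rounding. E.g. x == 3 gives
// (2^52 + 3) - 2^52 == 3.0.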
14672 // Handle final rounding.
14673 MVT DestVT = Op.getSimpleValueType();
14675 if (DestVT.bitsLT(MVT::f64))
14676 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
14677 DAG.getIntPtrConstant(0, dl));
14678 if (DestVT.bitsGT(MVT::f64))
14679 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
14681 // Handle final rounding.
14685 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
14686 const X86Subtarget &Subtarget, SDLoc &DL) {
14687 if (Op.getSimpleValueType() != MVT::v2f64)
14690 SDValue N0 = Op.getOperand(0);
14691 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
14693 // Legalize to v4i32 type.
14694 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
14695 DAG.getUNDEF(MVT::v2i32));
14697 if (Subtarget.hasAVX512())
14698 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
14700 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
14701 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
14702 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
14703 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
14705 // Two to the power of half-word-size.
14706 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
14708 // Clear the upper half-word for LO; shift the upper half-word down to form HI.
14709 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
14710 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
14712 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
14713 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
14714 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
14716 // Add the two halves.
14717 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
14720 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
14721 const X86Subtarget &Subtarget) {
14722 // The algorithm is the following:
14723 // #ifdef __SSE4_1__
14724 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14725 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14726 // (uint4) 0x53000000, 0xaa);
14728 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14729 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14731 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14732 // return (float4) lo + fhi;
14734 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
14735 // reassociate the two FADDs, and if we do that, the algorithm fails
14736 // spectacularly (PR24512).
14737 // FIXME: If we ever have some kind of Machine FMF, this should be marked
14738 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
14739 // there's also the MachineCombiner reassociations happening on Machine IR.
14740 if (DAG.getTarget().Options.UnsafeFPMath)
14744 SDValue V = Op->getOperand(0);
14745 MVT VecIntVT = V.getSimpleValueType();
14746 bool Is128 = VecIntVT == MVT::v4i32;
14747 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
14748 // If we convert to something other than the supported type, e.g., to v4f64,
14750 if (VecFloatVT != Op->getSimpleValueType(0))
14753 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
14754 "Unsupported custom type");
14756 // In the #ifdef/#else code, we have in common:
14757 // - The vector of constants:
14763 // Create the splat vector for 0x4b000000.
14764 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
14765 // Create the splat vector for 0x53000000.
14766 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
14768 // Create the right shift.
14769 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
14770 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14773 if (Subtarget.hasSSE41()) {
14774 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14775 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14776 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
14777 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
14778 // Low will be bitcasted right away, so do not bother bitcasting back to its original type.
14780 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14781 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
14782 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14783 // (uint4) 0x53000000, 0xaa);
14784 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
14785 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
14786 // High will be bitcasted right away, so do not bother bitcasting back to
14787 // its original type.
14788 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14789 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
14791 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
14792 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14793 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14794 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14796 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14797 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14800 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14801 SDValue VecCstFAdd = DAG.getConstantFP(
14802 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
14804 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14805 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
14806 // TODO: Are there any fast-math-flags to propagate here?
14808 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14809 // return (float4) lo + fhi;
14810 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
14811 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
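// Worked example of the split above for a single lane with v == 0x00010003:
//   lo  = (v & 0xffff) | 0x4b000000  -> float 2^23 + 3
//   hi  = (v >> 16)    | 0x53000000  -> float 2^39 + 1*2^16
//   fhi = hi - (2^39 + 2^23)         -> 2^16 - 2^23
//   lo + fhi = 3 + 65536 = 65539.0f == (float)0x00010003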
14814 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14815 SelectionDAG &DAG) const {
14816 SDValue N0 = Op.getOperand(0);
14817 MVT SrcVT = N0.getSimpleValueType();
14820 if (SrcVT.getVectorElementType() == MVT::i1) {
14821 if (SrcVT == MVT::v2i1)
14822 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14823 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
14824 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14825 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14826 DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
14829 switch (SrcVT.SimpleTy) {
14831 llvm_unreachable("Custom UINT_TO_FP is not supported!");
14836 MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14837 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14838 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14841 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
14844 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
14847 assert(Subtarget.hasAVX512());
14848 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14849 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
14853 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14854 SelectionDAG &DAG) const {
14855 SDValue N0 = Op.getOperand(0);
14857 auto PtrVT = getPointerTy(DAG.getDataLayout());
14859 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14860 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14861 // the optimization here.
14862 if (DAG.SignBitIsZero(N0))
14863 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14865 if (Op.getSimpleValueType().isVector())
14866 return lowerUINT_TO_FP_vec(Op, DAG);
14868 MVT SrcVT = N0.getSimpleValueType();
14869 MVT DstVT = Op.getSimpleValueType();
14871 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
14872 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
14873 // Conversions from unsigned i32 to f32/f64 are legal,
14874 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
14878 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14879 return LowerUINT_TO_FP_i64(Op, DAG);
14880 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14881 return LowerUINT_TO_FP_i32(Op, DAG);
14882 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14885 // Make a 64-bit buffer, and use it to build an FILD.
14886 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14887 if (SrcVT == MVT::i32) {
14888 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
14889 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14890 StackSlot, MachinePointerInfo());
14891 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
14892 OffsetSlot, MachinePointerInfo());
14893 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14897 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14898 SDValue ValueToStore = Op.getOperand(0);
14899 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
14900 // Bitcasting to f64 here allows us to do a single 64-bit store from
14901 // an SSE register, avoiding the store forwarding penalty that would come
14902 // with two 32-bit stores.
14903 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
14904 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
14905 MachinePointerInfo());
14906 // For i64 source, we need to add the appropriate power of 2 if the input
14907 // was negative. This is the same as the optimization in
14908 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
14909 // we must be careful to do the computation in x87 extended precision, not
14910 // in SSE. (The generic code can't know it's OK to do this, or how to.)
14911 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14912 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
14913 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14914 MachineMemOperand::MOLoad, 8, 8);
14916 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14917 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14918 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14921 APInt FF(32, 0x5F800000ULL);
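// 0x5F800000 is the IEEE single-precision encoding of 2^64. FILD treats the
// 64-bit buffer as signed, so an unsigned input u >= 2^63 is read back as
// u - 2^64; selecting this fudge value (instead of 0.0) and adding it on the
// x87 stack restores the intended unsigned value.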
14923 // Check whether the sign bit is set.
14924 SDValue SignSet = DAG.getSetCC(
14925 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
14926 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
14928 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14929 SDValue FudgePtr = DAG.getConstantPool(
14930 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
14932 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14933 SDValue Zero = DAG.getIntPtrConstant(0, dl);
14934 SDValue Four = DAG.getIntPtrConstant(4, dl);
14935 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14937 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
14939 // Load the value out, extending it from f32 to f80.
14940 // FIXME: Avoid the extend by constructing the right constant pool?
14941 SDValue Fudge = DAG.getExtLoad(
14942 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
14943 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
14944 /* Alignment = */ 4);
14945 // Extend everything to 80 bits to force it to be done on x87.
14946 // TODO: Are there any fast-math-flags to propagate here?
14947 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14948 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
14949 DAG.getIntPtrConstant(0, dl));
14952 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
14953 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
14954 // just return an <SDValue(), SDValue()> pair.
14955 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
14956 // to i16, i32 or i64, and we lower it to a legal sequence.
14957 // If lowered to the final integer result we return a <result, SDValue()> pair.
14958 // Otherwise we lower it to a sequence ending with a FIST, return a
14959 // <FIST, StackSlot> pair, and the caller is responsible for loading
14960 // the final integer result from StackSlot.
14961 std::pair<SDValue,SDValue>
14962 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14963 bool IsSigned, bool IsReplace) const {
14966 EVT DstTy = Op.getValueType();
14967 EVT TheVT = Op.getOperand(0).getValueType();
14968 auto PtrVT = getPointerTy(DAG.getDataLayout());
14970 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
14971 // f16 must be promoted before using the lowering in this routine.
14972 // fp128 does not use this lowering.
14973 return std::make_pair(SDValue(), SDValue());
14976 // If using FIST to compute an unsigned i64, we'll need some fixup
14977 // to handle values above the maximum signed i64. A FIST is always
14978 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
14979 bool UnsignedFixup = !IsSigned &&
14980 DstTy == MVT::i64 &&
14981 (!Subtarget.is64Bit() ||
14982 !isScalarFPTypeInSSEReg(TheVT));
14984 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
14985 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
14986 // The low 32 bits of the fist result will have the correct uint32 result.
14987 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14991 assert(DstTy.getSimpleVT() <= MVT::i64 &&
14992 DstTy.getSimpleVT() >= MVT::i16 &&
14993 "Unknown FP_TO_INT to lower!");
14995 // These are really Legal.
14996 if (DstTy == MVT::i32 &&
14997 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14998 return std::make_pair(SDValue(), SDValue());
14999 if (Subtarget.is64Bit() &&
15000 DstTy == MVT::i64 &&
15001 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15002 return std::make_pair(SDValue(), SDValue());
15004 // We lower FP->int64 into FISTP64 followed by a load from a temporary stack slot.
15006 MachineFunction &MF = DAG.getMachineFunction();
15007 unsigned MemSize = DstTy.getSizeInBits()/8;
15008 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15009 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15012 switch (DstTy.getSimpleVT().SimpleTy) {
15013 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15014 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15015 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15016 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15019 SDValue Chain = DAG.getEntryNode();
15020 SDValue Value = Op.getOperand(0);
15021 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15023 if (UnsignedFixup) {
15025 // Conversion to unsigned i64 is implemented with a select,
15026 // depending on whether the source value fits in the range
15027 // of a signed i64. Let Thresh be the FP equivalent of
15028 // 0x8000000000000000ULL.
15030 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15031 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
15032 // Fist-to-mem64 FistSrc
15033 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15034 // to XOR'ing the high 32 bits with Adjust.
15036 // Being a power of 2, Thresh is exactly representable in all FP formats.
15037 // For X87 we'd like to use the smallest FP type for this constant, but
15038 // for DAG type consistency we have to match the FP operand type.
15040 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
15041 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15042 bool LosesInfo = false;
15043 if (TheVT == MVT::f64)
15044 // The rounding mode is irrelevant as the conversion should be exact.
15045 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
15047 else if (TheVT == MVT::f80)
15048 Status = Thresh.convert(APFloat::x87DoubleExtended(),
15049 APFloat::rmNearestTiesToEven, &LosesInfo);
15051 assert(Status == APFloat::opOK && !LosesInfo &&
15052 "FP conversion should have been exact");
15054 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
15056 SDValue Cmp = DAG.getSetCC(DL,
15057 getSetCCResultType(DAG.getDataLayout(),
15058 *DAG.getContext(), TheVT),
15059 Value, ThreshVal, ISD::SETLT);
15060 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
15061 DAG.getConstant(0, DL, MVT::i32),
15062 DAG.getConstant(0x80000000, DL, MVT::i32));
15063 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
15064 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
15065 *DAG.getContext(), TheVT),
15066 Value, ThreshVal, ISD::SETLT);
15067 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
15070 // FIXME: This causes a redundant load/store if the SSE-class value is already
15071 // in memory, such as if it is on the callstack.
15072 if (isScalarFPTypeInSSEReg(TheVT)) {
15073 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15074 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
15075 MachinePointerInfo::getFixedStack(MF, SSFI));
15076 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
15078 Chain, StackSlot, DAG.getValueType(TheVT)
15081 MachineMemOperand *MMO =
15082 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15083 MachineMemOperand::MOLoad, MemSize, MemSize);
15084 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
15085 Chain = Value.getValue(1);
15086 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15087 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15090 MachineMemOperand *MMO =
15091 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15092 MachineMemOperand::MOStore, MemSize, MemSize);
15094 if (UnsignedFixup) {
15096 // Insert the FIST, load its result as two i32's,
15097 // and XOR the high i32 with Adjust.
15099 SDValue FistOps[] = { Chain, Value, StackSlot };
15100 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15101 FistOps, DstTy, MMO);
15104 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15105 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15108 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15109 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15111 if (Subtarget.is64Bit()) {
15112 // Join High32 and Low32 into a 64-bit result.
15113 // (High32 << 32) | Low32
15114 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15115 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15116 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15117 DAG.getConstant(32, DL, MVT::i8));
15118 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15119 return std::make_pair(Result, SDValue());
15122 SDValue ResultOps[] = { Low32, High32 };
15124 SDValue pair = IsReplace
15125 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15126 : DAG.getMergeValues(ResultOps, DL);
15127 return std::make_pair(pair, SDValue());
15129 // Build the FP_TO_INT*_IN_MEM
15130 SDValue Ops[] = { Chain, Value, StackSlot };
15131 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15133 return std::make_pair(FIST, StackSlot);
15137 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15138 const X86Subtarget &Subtarget) {
15139 MVT VT = Op->getSimpleValueType(0);
15140 SDValue In = Op->getOperand(0);
15141 MVT InVT = In.getSimpleValueType();
15144 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15145 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15147 // Optimize vectors in AVX mode:
15150 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
15151 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
15152 // Concat upper and lower parts.
15155 // Use vpunpckldq for 2 lower elements v4i32 -> v2i64.
15156 // Use vpunpckhdq for 2 upper elements v4i32 -> v2i64.
15157 // Concat upper and lower parts.
15160 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15161 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15162 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15165 if (Subtarget.hasInt256())
15166 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15168 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15169 SDValue Undef = DAG.getUNDEF(InVT);
15170 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15171 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15172 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15174 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15175 VT.getVectorNumElements()/2);
15177 OpLo = DAG.getBitcast(HVT, OpLo);
15178 OpHi = DAG.getBitcast(HVT, OpHi);
15180 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
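// Illustration for zero-extending v8i16 -> v8i32 without AVX2: unpcklwd of
// (In, Zero) interleaves each of the low four i16 elements with a zero i16,
// unpckhwd does the same for the high four, and the two v4i32 halves are
// concatenated into the v8i32 result. ANY_EXTEND uses undef instead of zero.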
15183 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15184 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15185 MVT VT = Op->getSimpleValueType(0);
15186 SDValue In = Op->getOperand(0);
15187 MVT InVT = In.getSimpleValueType();
15189 unsigned NumElts = VT.getVectorNumElements();
15190 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
15193 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
15194 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15196 assert(InVT.getVectorElementType() == MVT::i1);
15198 // Extend VT if the target is a 256-bit or 128-bit vector and VLX is not supported.
15200 if (!VT.is512BitVector() && !Subtarget.hasVLX())
15201 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15204 DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15206 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15208 SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
15210 return SelectedVal;
15211 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15214 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15215 SelectionDAG &DAG) {
15216 if (Subtarget.hasFp256())
15217 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15223 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15224 SelectionDAG &DAG) {
15226 MVT VT = Op.getSimpleValueType();
15227 SDValue In = Op.getOperand(0);
15228 MVT SVT = In.getSimpleValueType();
15230 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15231 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15233 if (Subtarget.hasFp256())
15234 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15237 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15238 VT.getVectorNumElements() != SVT.getVectorNumElements());
15242 /// Helper to recursively truncate vector elements in half with PACKSS.
15243 /// It makes use of the fact that vector comparison results will be all-zeros
15244 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15245 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15246 /// within each 128-bit lane.
15247 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15250 const X86Subtarget &Subtarget) {
15251 // Requires SSE2 but AVX512 has fast truncate.
15252 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15255 EVT SrcVT = In.getValueType();
15257 // No truncation required, we might get here due to recursive calls.
15258 if (SrcVT == DstVT)
15261 // We only support vector truncation to 128 bits or greater from a
15262 // source of 256 bits or greater.
15263 if ((DstVT.getSizeInBits() % 128) != 0)
15265 if ((SrcVT.getSizeInBits() % 256) != 0)
15268 unsigned NumElems = SrcVT.getVectorNumElements();
15269 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15270 assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15273 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15275 // Extract lower/upper subvectors.
15276 unsigned NumSubElts = NumElems / 2;
15277 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15278 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15279 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15281 // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15282 if (SrcVT.is256BitVector()) {
15283 Lo = DAG.getBitcast(MVT::v8i16, Lo);
15284 Hi = DAG.getBitcast(MVT::v8i16, Hi);
15285 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15286 return DAG.getBitcast(DstVT, Res);
15289 // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15290 // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15291 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15292 Lo = DAG.getBitcast(MVT::v16i16, Lo);
15293 Hi = DAG.getBitcast(MVT::v16i16, Hi);
15294 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15296 // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15297 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15298 Res = DAG.getBitcast(MVT::v4i64, Res);
15299 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15301 if (DstVT.is256BitVector())
15302 return DAG.getBitcast(DstVT, Res);
15304 // If this is a 512-bit -> 128-bit truncation, run another PACKSS stage.
15305 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15306 Res = DAG.getBitcast(PackedVT, Res);
15307 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15310 // Recursively pack lower/upper subvectors, concat result and pack again.
15311 assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15312 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15313 Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15314 Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15316 PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15317 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
15318 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
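// As a sketch of the recursion: truncating an all-ones/all-zeros v16i32
// comparison result to v16i8 on AVX2 first PACKSSes the two 256-bit halves
// together (plus the v4i64 lane-fixup shuffle) down to a v16i16-sized value,
// and a recursive call then packs that v16i16 down to the final v16i8.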
15321 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
15322 const X86Subtarget &Subtarget) {
15325 MVT VT = Op.getSimpleValueType();
15326 SDValue In = Op.getOperand(0);
15327 MVT InVT = In.getSimpleValueType();
15329 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15331 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
15332 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
15333 if (InVT.getScalarSizeInBits() <= 16) {
15334 if (Subtarget.hasBWI()) {
15335 // legal, will go to VPMOVB2M, VPMOVW2M
15336 // Shifting packed bytes is not supported natively; bitcast to words instead.
15337 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
15338 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
15339 DAG.getBitcast(ExtVT, In),
15340 DAG.getConstant(ShiftInx, DL, ExtVT));
15341 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
15342 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
15344 // Use TESTD/Q, extended vector to packed dword/qword.
15345 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15346 "Unexpected vector type.");
15347 unsigned NumElts = InVT.getVectorNumElements();
15348 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15349 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
15350 InVT = ExtVT;
15351 ShiftInx = InVT.getScalarSizeInBits() - 1;
15354 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
15355 DAG.getConstant(ShiftInx, DL, InVT));
15356 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
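// For example, without BWI a (truncate v16i16 %x to v16i1) cannot use
// VPMOVW2M, so %x is sign-extended to v16i32, shifted left by 31 so that only
// the original low bit survives in each element, and TESTM (vptestmd) then
// sets each mask bit iff the corresponding element is non-zero.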
15359 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
15361 MVT VT = Op.getSimpleValueType();
15362 SDValue In = Op.getOperand(0);
15363 MVT InVT = In.getSimpleValueType();
15365 if (VT == MVT::i1) {
15366 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15367 "Invalid scalar TRUNCATE operation");
15368 if (InVT.getSizeInBits() >= 32)
15370 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
15371 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
15373 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15374 "Invalid TRUNCATE operation");
15376 if (VT.getVectorElementType() == MVT::i1)
15377 return LowerTruncateVecI1(Op, DAG, Subtarget);
15379 // vpmovqb/w/d, vpmovdb/w, vpmovwb
15380 if (Subtarget.hasAVX512()) {
15381 // word to byte only under BWI
15382 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
15383 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
15384 DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
15385 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
15388 // Truncate with PACKSS if we are truncating a vector comparison result.
15389 // TODO: We should be able to support other operations as long as we
15390 // are saturating+packing zero/all bits only.
15391 auto IsPackableComparison = [](SDValue V) {
15392 unsigned Opcode = V.getOpcode();
15393 return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ ||
15394 Opcode == X86ISD::CMPP);
15397 if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS &&
15398 all_of(In->ops(), IsPackableComparison))) {
15399 if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
15403 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
15404 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
15405 if (Subtarget.hasInt256()) {
15406 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
15407 In = DAG.getBitcast(MVT::v8i32, In);
15408 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
15410 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
15411 DAG.getIntPtrConstant(0, DL));
15414 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15415 DAG.getIntPtrConstant(0, DL));
15416 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15417 DAG.getIntPtrConstant(2, DL));
15418 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15419 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15420 static const int ShufMask[] = {0, 2, 4, 6};
15421 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
15424 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
15425 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
15426 if (Subtarget.hasInt256()) {
15427 In = DAG.getBitcast(MVT::v32i8, In);
15429 SmallVector<SDValue,32> pshufbMask;
15430 for (unsigned i = 0; i < 2; ++i) {
15431 pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
15432 pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
15433 pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
15434 pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
15435 pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
15436 pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
15437 pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
15438 pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
15439 for (unsigned j = 0; j < 8; ++j)
15440 pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
15442 SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
15443 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
15444 In = DAG.getBitcast(MVT::v4i64, In);
15446 static const int ShufMask[] = {0, 2, -1, -1};
15447 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
15449 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15450 DAG.getIntPtrConstant(0, DL));
15451 return DAG.getBitcast(VT, In);
15454 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15455 DAG.getIntPtrConstant(0, DL));
15457 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15458 DAG.getIntPtrConstant(4, DL));
15460 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
15461 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
15463 // The PSHUFB mask:
15464 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
15465 -1, -1, -1, -1, -1, -1, -1, -1};
15467 SDValue Undef = DAG.getUNDEF(MVT::v16i8);
15468 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
15469 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
15471 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15472 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15474 // The MOVLHPS Mask:
15475 static const int ShufMask2[] = {0, 1, 4, 5};
15476 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
15477 return DAG.getBitcast(MVT::v8i16, res);
15480 // Handle truncation of V256 to V128 using shuffles.
15481 if (!VT.is128BitVector() || !InVT.is256BitVector())
15484 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
15486 unsigned NumElems = VT.getVectorNumElements();
15487 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
15489 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
15490 // Prepare truncation shuffle mask
15491 for (unsigned i = 0; i != NumElems; ++i)
15492 MaskVec[i] = i * 2;
15493 SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
15494 DAG.getUNDEF(NVT), MaskVec);
15495 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
15496 DAG.getIntPtrConstant(0, DL));
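// For example, on an AVX1-only target (truncate v16i16 %x to v16i8) reaches
// the shuffle path above: %x is bitcast to v32i8, the even bytes
// {0, 2, 4, ..., 30} are gathered by a shuffle, and the low 128 bits are
// extracted as the v16i8 result.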
15499 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
15500 const X86Subtarget &Subtarget,
15501 SelectionDAG &DAG) const {
15502 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
15504 MVT VT = Op.getSimpleValueType();
15506 if (VT.isVector()) {
15507 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
15508 SDValue Src = Op.getOperand(0);
15510 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
15511 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
15513 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
15514 DAG.getUNDEF(MVT::v2f32)));
15520 assert(!VT.isVector());
15522 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
15523 IsSigned, /*IsReplace=*/ false);
15524 SDValue FIST = Vals.first, StackSlot = Vals.second;
15525 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
15526 if (!FIST.getNode())
15529 if (StackSlot.getNode())
15530 // Load the result.
15531 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
15533 // The node is the result.
15537 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
15539 MVT VT = Op.getSimpleValueType();
15540 SDValue In = Op.getOperand(0);
15541 MVT SVT = In.getSimpleValueType();
15543 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
15545 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
15546 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
15547 In, DAG.getUNDEF(SVT)));
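// For example, (fp_extend v2f32 %x to v2f64) becomes
//   (X86ISD::VFPEXT (concat_vectors %x, undef))
// i.e. a cvtps2pd whose upper two source lanes are undef.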
15550 /// The only differences between FABS and FNEG are the mask and the logic op.
15551 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
15552 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
15553 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
15554 "Wrong opcode for lowering FABS or FNEG.");
15556 bool IsFABS = (Op.getOpcode() == ISD::FABS);
15558 // If this is a FABS and it has an FNEG user, bail out to fold the combination
15559 // into an FNABS. We'll lower the FABS after that if it is still in use.
15561 for (SDNode *User : Op->uses())
15562 if (User->getOpcode() == ISD::FNEG)
15566 MVT VT = Op.getSimpleValueType();
15568 bool IsF128 = (VT == MVT::f128);
15570 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
15571 // decide if we should generate a 16-byte constant mask when we only need 4 or
15572 // 8 bytes for the scalar case.
15577 if (VT.isVector()) {
15579 EltVT = VT.getVectorElementType();
15580 } else if (IsF128) {
15581 // SSE instructions are used for optimized f128 logical operations.
15582 LogicVT = MVT::f128;
15585 // There are no scalar bitwise logical SSE/AVX instructions, so we
15586 // generate a 16-byte vector constant and logic op even for the scalar case.
15587 // Using a 16-byte mask allows folding the load of the mask with
15588 // the logic op, so it can save (~4 bytes) on code size.
15589 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
15593 unsigned EltBits = EltVT.getSizeInBits();
15594 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
15595 APInt MaskElt =
15596 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
15597 const fltSemantics &Sem =
15598 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
15599 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
15600 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
15602 SDValue Op0 = Op.getOperand(0);
15603 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
15605 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
15606 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
15608 if (VT.isVector() || IsF128)
15609 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
15611 // For the scalar case extend to a 128-bit vector, perform the logic op,
15612 // and extract the scalar result back out.
15613 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
15614 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
15615 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
15616 DAG.getIntPtrConstant(0, dl));
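// For example, a scalar (fabs f32 %x) is emitted roughly as
//   %v   = scalar_to_vector %x                 ; v4f32
//   %and = X86ISD::FAND %v, <0x7fffffff splat> ; clear the sign bit
//   %res = extract_vector_elt %and, 0
// with the 16-byte mask allowing the constant load to fold into the logic op.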
15619 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
15620 SDValue Mag = Op.getOperand(0);
15621 SDValue Sign = Op.getOperand(1);
15624 // If the sign operand is smaller, extend it first.
15625 MVT VT = Op.getSimpleValueType();
15626 if (Sign.getSimpleValueType().bitsLT(VT))
15627 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
15629 // And if it is bigger, shrink it first.
15630 if (Sign.getSimpleValueType().bitsGT(VT))
15631 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
15633 // At this point the operands and the result should have the same
15634 // type, and that won't be f80 since that is not custom lowered.
15635 bool IsF128 = (VT == MVT::f128);
15636 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
15637 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
15638 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
15639 "Unexpected type in LowerFCOPYSIGN");
15641 MVT EltVT = VT.getScalarType();
15642 const fltSemantics &Sem =
15643 EltVT == MVT::f64 ? APFloat::IEEEdouble()
15644 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
15646 // Perform all scalar logic operations as 16-byte vectors because there are no
15647 // scalar FP logic instructions in SSE.
15648 // TODO: This isn't necessary. If we used scalar types, we might avoid some
15649 // unnecessary splats, but we might miss load folding opportunities. Should
15650 // this decision be based on OptimizeForSize?
15651 bool IsFakeVector = !VT.isVector() && !IsF128;
15654 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
15656 // The mask constants are automatically splatted for vector types.
15657 unsigned EltSizeInBits = VT.getScalarSizeInBits();
15658 SDValue SignMask = DAG.getConstantFP(
15659 APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
15660 SDValue MagMask = DAG.getConstantFP(
15661 APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
15663 // First, clear all bits but the sign bit from the second operand (sign).
15665 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
15666 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
15668 // Next, clear the sign bit from the first operand (magnitude).
15669 // TODO: If we had general constant folding for FP logic ops, this check
15670 // wouldn't be necessary.
15672 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
15673 APFloat APF = Op0CN->getValueAPF();
15675 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
15677 // If the magnitude operand wasn't a constant, we need to AND out the sign.
15679 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
15680 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
15683 // OR the magnitude value with the sign bit.
15684 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
15685 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
15686 DAG.getIntPtrConstant(0, dl));
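// In short, copysign is built as (mag & ~sign-mask) | (sign & sign-mask),
// with both logic ops performed on 128-bit (or wider) FP vector types and the
// scalar result, when there is one, extracted from lane 0 at the end.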
15689 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
15690 SDValue N0 = Op.getOperand(0);
15692 MVT VT = Op.getSimpleValueType();
15694 MVT OpVT = N0.getSimpleValueType();
15695 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
15696 "Unexpected type for FGETSIGN");
15698 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
15699 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
15700 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
15701 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
15702 Res = DAG.getZExtOrTrunc(Res, dl, VT);
15703 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
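// MOVMSK copies the sign bit of each lane into a GPR, so masking with 1
// leaves exactly the sign of the scalar operand sitting in bit 0.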
15707 // Check whether an OR'd tree is PTEST-able.
15708 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
15709 SelectionDAG &DAG) {
15710 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
15712 if (!Subtarget.hasSSE41())
15715 if (!Op->hasOneUse())
15718 SDNode *N = Op.getNode();
15721 SmallVector<SDValue, 8> Opnds;
15722 DenseMap<SDValue, unsigned> VecInMap;
15723 SmallVector<SDValue, 8> VecIns;
15724 EVT VT = MVT::Other;
15726 // Recognize a special case where a vector is cast into a wide integer to
15727 // test all 0s.
15728 Opnds.push_back(N->getOperand(0));
15729 Opnds.push_back(N->getOperand(1));
15731 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
15732 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
15733 // BFS traverse all OR'd operands.
15734 if (I->getOpcode() == ISD::OR) {
15735 Opnds.push_back(I->getOperand(0));
15736 Opnds.push_back(I->getOperand(1));
15737 // Re-evaluate the number of nodes to be traversed.
15738 e += 2; // 2 more nodes (LHS and RHS) are pushed.
15742 // Quit if a non-EXTRACT_VECTOR_ELT
15743 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15746 // Quit if without a constant index.
15747 SDValue Idx = I->getOperand(1);
15748 if (!isa<ConstantSDNode>(Idx))
15751 SDValue ExtractedFromVec = I->getOperand(0);
15752 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
15753 if (M == VecInMap.end()) {
15754 VT = ExtractedFromVec.getValueType();
15755 // Quit if not 128/256-bit vector.
15756 if (!VT.is128BitVector() && !VT.is256BitVector())
15758 // Quit if not the same type.
15759 if (VecInMap.begin() != VecInMap.end() &&
15760 VT != VecInMap.begin()->first.getValueType())
15762 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
15763 VecIns.push_back(ExtractedFromVec);
15765 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
15768 assert((VT.is128BitVector() || VT.is256BitVector()) &&
15769 "Not extracted from 128-/256-bit vector.");
15771 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
15773 for (DenseMap<SDValue, unsigned>::const_iterator
15774 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
15775 // Quit if not all elements are used.
15776 if (I->second != FullMask)
15780 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
15782 // Cast all vectors into TestVT for PTEST.
15783 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
15784 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
15786 // If more than one full vector is evaluated, OR them together before the PTEST.
15787 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
15788 // Each iteration will OR 2 nodes and append the result until there is only
15789 // 1 node left, i.e. the final OR'd value of all vectors.
15790 SDValue LHS = VecIns[Slot];
15791 SDValue RHS = VecIns[Slot + 1];
15792 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
15795 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
15796 VecIns.back(), VecIns.back());
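// For example, when a v4i64 %v has been split into scalar pieces and tested as
//   (or (or (extractelt %v, 0), (extractelt %v, 1)),
//       (or (extractelt %v, 2), (extractelt %v, 3))) == 0
// the whole OR tree collapses into a single (X86ISD::PTEST %v, %v) whose ZF
// tells whether every element is zero.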
15799 /// \brief return true if \c Op has a use that doesn't just read flags.
15800 static bool hasNonFlagsUse(SDValue Op) {
15801 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15803 SDNode *User = *UI;
15804 unsigned UOpNo = UI.getOperandNo();
15805 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15806 // Look past the truncate.
15807 UOpNo = User->use_begin().getOperandNo();
15808 User = *User->use_begin();
15811 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15812 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15818 // Emit KTEST instruction for bit vectors on AVX-512
15819 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
15820 const X86Subtarget &Subtarget) {
15821 if (Op.getOpcode() == ISD::BITCAST) {
15822 auto hasKTEST = [&](MVT VT) {
15823 unsigned SizeInBits = VT.getSizeInBits();
15824 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
15825 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
15827 SDValue Op0 = Op.getOperand(0);
15828 MVT Op0VT = Op0.getValueType().getSimpleVT();
15829 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
15831 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
15836 /// Emit nodes that will be selected as "test Op0,Op0", or something
15838 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
15839 SelectionDAG &DAG) const {
15840 if (Op.getValueType() == MVT::i1) {
15841 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
15842 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
15843 DAG.getConstant(0, dl, MVT::i8));
15845 // CF and OF aren't always set the way we want. Determine which
15846 // of these we need.
15847 bool NeedCF = false;
15848 bool NeedOF = false;
15851 case X86::COND_A: case X86::COND_AE:
15852 case X86::COND_B: case X86::COND_BE:
15855 case X86::COND_G: case X86::COND_GE:
15856 case X86::COND_L: case X86::COND_LE:
15857 case X86::COND_O: case X86::COND_NO: {
15858 // Check if we really need to set the
15859 // Overflow flag. If NoSignedWrap is present
15860 // that is not actually needed.
15861 switch (Op->getOpcode()) {
15866 const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
15867 if (BinNode->Flags.hasNoSignedWrap())
15877 // See if we can use the EFLAGS value from the operand instead of
15878 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
15879 // we prove that the arithmetic won't overflow, we can't use OF or CF.
15880 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
15881 // Emit KTEST for bit vectors
15882 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
15884 // Emit a CMP with 0, which is the TEST pattern.
15885 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15886 DAG.getConstant(0, dl, Op.getValueType()));
15888 unsigned Opcode = 0;
15889 unsigned NumOperands = 0;
15891 // Truncate operations may prevent the merge of the SETCC instruction
15892 // and the arithmetic instruction before it. Attempt to truncate the operands
15893 // of the arithmetic instruction and use a reduced bit-width instruction.
15894 bool NeedTruncation = false;
15895 SDValue ArithOp = Op;
15896 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
15897 SDValue Arith = Op->getOperand(0);
15898 // Both the trunc and the arithmetic op need to have one user each.
15899 if (Arith->hasOneUse())
15900 switch (Arith.getOpcode()) {
15907 NeedTruncation = true;
15913 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
15914 // which may be the result of a CAST. We use the variable 'Op', which is the
15915 // non-casted variable when we check for possible users.
15916 switch (ArithOp.getOpcode()) {
15918 // Due to an isel shortcoming, be conservative if this add is likely to be
15919 // selected as part of a load-modify-store instruction. When the root node
15920 // in a match is a store, isel doesn't know how to remap non-chain non-flag
15921 // uses of other nodes in the match, such as the ADD in this case. This
15922 // leads to the ADD being left around and reselected, with the result being
15923 // two adds in the output. Alas, even if none of our users are stores, that
15924 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
15925 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
15926 // climbing the DAG back to the root, and it doesn't seem to be worth the
15928 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15929 UE = Op.getNode()->use_end(); UI != UE; ++UI)
15930 if (UI->getOpcode() != ISD::CopyToReg &&
15931 UI->getOpcode() != ISD::SETCC &&
15932 UI->getOpcode() != ISD::STORE)
15935 if (ConstantSDNode *C =
15936 dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
15937 // An add of one will be selected as an INC.
15938 if (C->isOne() && !Subtarget.slowIncDec()) {
15939 Opcode = X86ISD::INC;
15944 // An add of negative one (subtract of one) will be selected as a DEC.
15945 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
15946 Opcode = X86ISD::DEC;
15952 // Otherwise use a regular EFLAGS-setting add.
15953 Opcode = X86ISD::ADD;
15958 // If we have a constant logical shift that's only used in a comparison
15959 // against zero turn it into an equivalent AND. This allows turning it into
15960 // a TEST instruction later.
15961 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
15962 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
15963 EVT VT = Op.getValueType();
15964 unsigned BitWidth = VT.getSizeInBits();
15965 unsigned ShAmt = Op->getConstantOperandVal(1);
15966 if (ShAmt >= BitWidth) // Avoid undefined shifts.
15968 APInt Mask = ArithOp.getOpcode() == ISD::SRL
15969 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
15970 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
15971 if (!Mask.isSignedIntN(32)) // Avoid large immediates.
15973 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
15974 DAG.getConstant(Mask, dl, VT));
15979 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
15980 // because a TEST instruction will be better.
15981 if (!hasNonFlagsUse(Op)) {
15982 SDValue Op0 = ArithOp->getOperand(0);
15983 SDValue Op1 = ArithOp->getOperand(1);
15984 EVT VT = ArithOp.getValueType();
15985 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
15986 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
15988 // But if we can combine this into an ANDN operation, then create an AND
15989 // now and allow it to be pattern matched into an ANDN.
15990 if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
15997 // Due to the ISEL shortcoming noted above, be conservative if this op is
15998 // likely to be selected as part of a load-modify-store instruction.
15999 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16000 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16001 if (UI->getOpcode() == ISD::STORE)
16004 // Otherwise use a regular EFLAGS-setting instruction.
16005 switch (ArithOp.getOpcode()) {
16006 default: llvm_unreachable("unexpected operator!");
16007 case ISD::SUB: Opcode = X86ISD::SUB; break;
16008 case ISD::XOR: Opcode = X86ISD::XOR; break;
16009 case ISD::AND: Opcode = X86ISD::AND; break;
16011 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
16012 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
16015 Opcode = X86ISD::OR;
16029 return SDValue(Op.getNode(), 1);
16035 // If we found that truncation is beneficial, perform the truncation and
16037 if (NeedTruncation) {
16038 EVT VT = Op.getValueType();
16039 SDValue WideVal = Op->getOperand(0);
16040 EVT WideVT = WideVal.getValueType();
16041 unsigned ConvertedOp = 0;
16042 // Use a target machine opcode to prevent further DAGCombine
16043 // optimizations that may separate the arithmetic operations
16044 // from the setcc node.
16045 switch (WideVal.getOpcode()) {
16047 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
16048 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
16049 case ISD::AND: ConvertedOp = X86ISD::AND; break;
16050 case ISD::OR: ConvertedOp = X86ISD::OR; break;
16051 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
16055 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16056 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
16057 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
16058 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
16059 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
16065 // Emit KTEST for bit vectors
16066 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16069 // Emit a CMP with 0, which is the TEST pattern.
16070 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16071 DAG.getConstant(0, dl, Op.getValueType()));
16073 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
16074 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
16076 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
16077 DAG.ReplaceAllUsesWith(Op, New);
16078 return SDValue(New.getNode(), 1);
16081 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
16083 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
16084 const SDLoc &dl, SelectionDAG &DAG) const {
16085 if (isNullConstant(Op1))
16086 return EmitTest(Op0, X86CC, dl, DAG);
16088 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16089 "Unexpected comparison operation for MVT::i1 operands");
16091 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16092 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16093 // Only promote the compare up to i32 if it is a 16-bit operation
16094 // with an immediate; 16-bit immediates are to be avoided.
16095 if ((Op0.getValueType() == MVT::i16 &&
16096 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16097 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16098 !Subtarget.isAtom()) {
16099 unsigned ExtendOp =
16100 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16101 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16102 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16104 // Use SUB instead of CMP to enable CSE between SUB and CMP.
16105 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16106 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
16108 return SDValue(Sub.getNode(), 1);
16110 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16113 /// Convert a comparison if required by the subtarget.
16114 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16115 SelectionDAG &DAG) const {
16116 // If the subtarget does not support the FUCOMI instruction, floating-point
16117 // comparisons have to be converted.
16118 if (Subtarget.hasCMov() ||
16119 Cmp.getOpcode() != X86ISD::CMP ||
16120 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16121 !Cmp.getOperand(1).getValueType().isFloatingPoint())
16124 // The instruction selector will select an FUCOM instruction instead of
16125 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16126 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16127 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
16129 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16130 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16131 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16132 DAG.getConstant(8, dl, MVT::i8));
16133 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16135 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16136 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16137 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
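// The node sequence above corresponds to the classic x87 idiom, roughly:
//   fucomp          ; compare, condition codes land in FPSW
//   fnstsw  %ax     ; copy FPSW into AX
//   sahf            ; move AH (C0/C2/C3) into EFLAGS
// after which the usual EFLAGS-based SETCC/CMOV/branch lowering applies.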
16140 /// Check if replacement of SQRT with RSQRT should be disabled.
16141 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16142 EVT VT = Op.getValueType();
16144 // We never want to use both SQRT and RSQRT instructions for the same input.
16145 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16149 return Subtarget.hasFastVectorFSQRT();
16150 return Subtarget.hasFastScalarFSQRT();
16153 /// The minimum architected relative accuracy is 2^-12. We need one
16154 /// Newton-Raphson step to have a good float result (24 bits of precision).
16155 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16156 SelectionDAG &DAG, int Enabled,
16157 int &RefinementSteps,
16158 bool &UseOneConstNR,
16159 bool Reciprocal) const {
16160 EVT VT = Op.getValueType();
16162 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16163 // TODO: Add support for AVX512 (v16f32).
16164 // It is likely not profitable to do this for f64 because a double-precision
16165 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16166 // instructions: convert to single, rsqrtss, convert back to double, refine
16167 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16168 // along with FMA, this could be a throughput win.
16169 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16170 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16171 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16172 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16173 RefinementSteps = 1;
16175 UseOneConstNR = false;
16176 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16181 /// The minimum architected relative accuracy is 2^-12. We need one
16182 /// Newton-Raphson step to have a good float result (24 bits of precision).
16183 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16185 int &RefinementSteps) const {
16186 EVT VT = Op.getValueType();
16188 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16189 // TODO: Add support for AVX512 (v16f32).
16190 // It is likely not profitable to do this for f64 because a double-precision
16191 // reciprocal estimate with refinement on x86 prior to FMA requires
16192 // 15 instructions: convert to single, rcpss, convert back to double, refine
16193 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16194 // along with FMA, this could be a throughput win.
16196 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16197 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16198 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16199 // Enable estimate codegen with 1 refinement step for vector division.
16200 // Scalar division estimates are disabled because they break too much
16201 // real-world code. These defaults are intended to match GCC behavior.
16202 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
16205 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16206 RefinementSteps = 1;
16208 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16213 /// If we have at least two divisions that use the same divisor, convert to
16214 /// multiplication by a reciprocal. This may need to be adjusted for a given
16215 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16216 /// This is because we still need one division to calculate the reciprocal and
16217 /// then we need two multiplies by that reciprocal as replacements for the
16218 /// original divisions.
16219 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16223 /// Helper for creating a X86ISD::SETCC node.
16224 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16225 SelectionDAG &DAG) {
16226 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16227 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16230 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16231 /// according to equal/not-equal condition code \p CC.
16232 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16233 const SDLoc &dl, SelectionDAG &DAG) {
16234 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
16235 // instruction. Since the shift amount is in-range-or-undefined, we know
16236 // that doing a bittest on the i32 value is ok. We extend to i32 because
16237 // the encoding for the i16 version is larger than the i32 version.
16238 // Also promote i16 to i32 for performance / code size reasons.
16239 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16240 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16242 // See if we can use the 32-bit instruction instead of the 64-bit one for a
16243 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16244 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16245 // known to be zero.
16246 if (Src.getValueType() == MVT::i64 &&
16247 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16248 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16250 // If the operand types disagree, extend the shift amount to match. Since
16251 // BT ignores high bits (like shifts) we can use anyextend.
16252 if (Src.getValueType() != BitNo.getValueType())
16253 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
16255 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16256 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16257 return getSETCC(Cond, BT, dl , DAG);
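// For example, (setcc (and %x, (shl 1, %n)), 0, ne) ends up here as
//   (X86ISD::BT %x, %n) followed by a SETCC on COND_B,
// i.e. the carry flag produced by BT holds the tested bit.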
16260 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16261 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16262 const SDLoc &dl, SelectionDAG &DAG) {
16263 SDValue Op0 = And.getOperand(0);
16264 SDValue Op1 = And.getOperand(1);
16265 if (Op0.getOpcode() == ISD::TRUNCATE)
16266 Op0 = Op0.getOperand(0);
16267 if (Op1.getOpcode() == ISD::TRUNCATE)
16268 Op1 = Op1.getOperand(0);
16271 if (Op1.getOpcode() == ISD::SHL)
16272 std::swap(Op0, Op1);
16273 if (Op0.getOpcode() == ISD::SHL) {
16274 if (isOneConstant(Op0.getOperand(0))) {
16275 // If we looked past a truncate, check that it's only truncating away
16277 unsigned BitWidth = Op0.getValueSizeInBits();
16278 unsigned AndBitWidth = And.getValueSizeInBits();
16279 if (BitWidth > AndBitWidth) {
16281 DAG.computeKnownBits(Op0, Zeros, Ones);
16282 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
16286 RHS = Op0.getOperand(1);
16288 } else if (Op1.getOpcode() == ISD::Constant) {
16289 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16290 uint64_t AndRHSVal = AndRHS->getZExtValue();
16291 SDValue AndLHS = Op0;
16293 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16294 LHS = AndLHS.getOperand(0);
16295 RHS = AndLHS.getOperand(1);
16298 // Use BT if the immediate can't be encoded in a TEST instruction.
16299 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
16301 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
16306 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
16311 // Convert (truncate (srl X, N) to i1) to (bt X, N)
16312 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
16313 const SDLoc &dl, SelectionDAG &DAG) {
16315 assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
16316 "Expected TRUNCATE to i1 node");
16318 if (Op.getOperand(0).getOpcode() != ISD::SRL)
16321 SDValue ShiftRight = Op.getOperand(0);
16322 return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
16326 /// Result of 'and' or 'trunc to i1' is compared against zero.
16327 /// Change to a BT node if possible.
16328 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
16329 const SDLoc &dl, SelectionDAG &DAG) const {
16330 if (Op.getOpcode() == ISD::AND)
16331 return LowerAndToBT(Op, CC, dl, DAG);
16332 if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
16333 return LowerTruncateToBT(Op, CC, dl, DAG);
16337 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
16339 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
16344 // SSE Condition code mapping:
16353 switch (SetCCOpcode) {
16354 default: llvm_unreachable("Unexpected SETCC condition");
16356 case ISD::SETEQ: SSECC = 0; break;
16358 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
16360 case ISD::SETOLT: SSECC = 1; break;
16362 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
16364 case ISD::SETOLE: SSECC = 2; break;
16365 case ISD::SETUO: SSECC = 3; break;
16367 case ISD::SETNE: SSECC = 4; break;
16368 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16369 case ISD::SETUGE: SSECC = 5; break;
16370 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16371 case ISD::SETUGT: SSECC = 6; break;
16372 case ISD::SETO: SSECC = 7; break;
16374 case ISD::SETONE: SSECC = 8; break;
16377 std::swap(Op0, Op1);
16382 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
16383 /// concatenate the result back.
16384 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
16385 MVT VT = Op.getSimpleValueType();
16387 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
16388 "Unsupported value type for operation");
16390 unsigned NumElems = VT.getVectorNumElements();
16392 SDValue CC = Op.getOperand(2);
16394 // Extract the LHS vectors
16395 SDValue LHS = Op.getOperand(0);
16396 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
16397 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
16399 // Extract the RHS vectors
16400 SDValue RHS = Op.getOperand(1);
16401 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
16402 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
16404 // Issue the operation on the smaller types and concatenate the result back
16405 MVT EltVT = VT.getVectorElementType();
16406 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16407 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16408 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
16409 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
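// For example, on AVX1 (which has no 256-bit integer compares) a v8i32 setcc
// is split into two v4i32 setccs on the 128-bit halves and the results are
// joined back together with CONCAT_VECTORS.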
16412 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16413 SDValue Op0 = Op.getOperand(0);
16414 SDValue Op1 = Op.getOperand(1);
16415 SDValue CC = Op.getOperand(2);
16416 MVT VT = Op.getSimpleValueType();
16419 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
16420 "Unexpected type for boolean compare operation");
16421 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16422 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
16423 DAG.getConstant(-1, dl, VT));
16424 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
16425 DAG.getConstant(-1, dl, VT));
16426 switch (SetCCOpcode) {
16427 default: llvm_unreachable("Unexpected SETCC condition");
16429 // (x == y) -> ~(x ^ y)
16430 return DAG.getNode(ISD::XOR, dl, VT,
16431 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
16432 DAG.getConstant(-1, dl, VT));
16434 // (x != y) -> (x ^ y)
16435 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
16438 // (x > y) -> (x & ~y)
16439 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
16442 // (x < y) -> (~x & y)
16443 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
16446 // (x <= y) -> (~x | y)
16447 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
16450 // (x >=y) -> (x | ~y)
16451 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
16455 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16457 SDValue Op0 = Op.getOperand(0);
16458 SDValue Op1 = Op.getOperand(1);
16459 SDValue CC = Op.getOperand(2);
16460 MVT VT = Op.getSimpleValueType();
16463 assert(VT.getVectorElementType() == MVT::i1 &&
16464 "Cannot set masked compare for this operation");
16466 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16468 bool Unsigned = false;
16471 switch (SetCCOpcode) {
16472 default: llvm_unreachable("Unexpected SETCC condition");
16473 case ISD::SETNE: SSECC = 4; break;
16474 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
16475 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
16476 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
16477 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
16478 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
16479 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
16480 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
16481 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
16482 case ISD::SETLE: SSECC = 2; break;
16486 std::swap(Op0, Op1);
16488 return DAG.getNode(Opc, dl, VT, Op0, Op1);
16489 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
16490 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16491 DAG.getConstant(SSECC, dl, MVT::i8));
16494 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
16495 /// operand \p Op1. If non-trivial (for example because it's not constant)
16496 /// return an empty value.
16497 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
16498 SelectionDAG &DAG) {
16499 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
16503 MVT VT = Op1.getSimpleValueType();
16504 MVT EVT = VT.getVectorElementType();
16505 unsigned n = VT.getVectorNumElements();
16506 SmallVector<SDValue, 8> ULTOp1;
16508 for (unsigned i = 0; i < n; ++i) {
16509 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
16510 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
16513 // Avoid underflow.
16514 APInt Val = Elt->getAPIntValue();
16518 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
16521 return DAG.getBuildVector(VT, dl, ULTOp1);
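// For example, (setult %x, <4, 8, 4, 8>) can be rewritten as
// (setule %x, <3, 7, 3, 7>); elements equal to zero make the rewrite invalid
// (x <u 0 is always false), which is what the underflow check above guards
// against.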
16524 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
16525 SelectionDAG &DAG) {
16526 SDValue Op0 = Op.getOperand(0);
16527 SDValue Op1 = Op.getOperand(1);
16528 SDValue CC = Op.getOperand(2);
16529 MVT VT = Op.getSimpleValueType();
16530 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16531 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
16536 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
16537 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
16541 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
16542 assert(VT.getVectorNumElements() <= 16);
16543 Opc = X86ISD::CMPM;
16545 Opc = X86ISD::CMPP;
16546 // The SSE/AVX packed FP comparison nodes are defined with a
16547 // floating-point vector result that matches the operand type. This allows
16548 // them to work with an SSE1 target (integer vector types are not legal).
16549 VT = Op0.getSimpleValueType();
16552 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
16553 // emit two comparisons and a logic op to tie them together.
16554 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
16557 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
16559 // LLVM predicate is SETUEQ or SETONE.
16561 unsigned CombineOpc;
16562 if (SetCCOpcode == ISD::SETUEQ) {
16565 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
16566 static_cast<unsigned>(ISD::OR);
16568 assert(SetCCOpcode == ISD::SETONE);
16571 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
16572 static_cast<unsigned>(ISD::AND);
16575 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
16576 DAG.getConstant(CC0, dl, MVT::i8));
16577 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
16578 DAG.getConstant(CC1, dl, MVT::i8));
16579 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
16581 // Handle all other FP comparisons here.
16582 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
16583 DAG.getConstant(SSECC, dl, MVT::i8));
16586 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
16587 // result type of SETCC. The bitcast is expected to be optimized away
16588 // during combining/isel.
16589 if (Opc == X86ISD::CMPP)
16590 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
16595 MVT VTOp0 = Op0.getSimpleValueType();
16596 assert(VTOp0 == Op1.getSimpleValueType() &&
16597 "Expected operands with same type!");
16598 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
16599 "Invalid number of packed elements for source and destination!");
16601 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
16602 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
16603 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
16604 // legalizer first checks whether the first operand of the setcc has
16605 // a legal type. If so, then it promotes the return type to that same type.
16606 // Otherwise, the return type is promoted to the 'next legal type' which,
16607 // for a vector of MVT::i1 is always a 128-bit integer vector type.
16609 // We reach this code only if the following two conditions are met:
16610 // 1. Both return type and operand type have been promoted to wider types
16611 // by the type legalizer.
16612 // 2. The original operand type has been promoted to a 256-bit vector.
16614 // Note that condition 2. only applies for AVX targets.
16615 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
16616 return DAG.getZExtOrTrunc(NewOp, dl, VT);
16619 // The non-AVX512 code below works under the assumption that source and
16620 // destination types are the same.
16621 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
16622 "Value types for source and destination must be the same!");
16624 // Break 256-bit integer vector compare into smaller ones.
16625 if (VT.is256BitVector() && !Subtarget.hasInt256())
16626 return Lower256IntVSETCC(Op, DAG);
16628 // Operands are boolean (vectors of i1)
16629 MVT OpVT = Op1.getSimpleValueType();
16630 if (OpVT.getVectorElementType() == MVT::i1)
16631 return LowerBoolVSETCC_AVX512(Op, DAG);
16633 // The result is boolean, but operands are int/float
16634 if (VT.getVectorElementType() == MVT::i1) {
16635 // In the AVX-512 architecture a setcc returns a mask with i1 elements,
16636 // but there is no compare instruction for i8 and i16 elements in KNL.
16637 // In that case use the SSE compare instead.
16638 bool UseAVX512Inst =
16639 (OpVT.is512BitVector() ||
16640 OpVT.getScalarSizeInBits() >= 32 ||
16641 (Subtarget.hasBWI() && Subtarget.hasVLX()));
16644 return LowerIntVSETCC_AVX512(Op, DAG);
16646 return DAG.getNode(ISD::TRUNCATE, dl, VT,
16647 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
16650 // Lower using XOP integer comparisons.
16651 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
16652 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
16653 // Translate compare code to XOP PCOM compare mode.
16654 unsigned CmpMode = 0;
16655 switch (SetCCOpcode) {
16656 default: llvm_unreachable("Unexpected SETCC condition");
16658 case ISD::SETLT: CmpMode = 0x00; break;
16660 case ISD::SETLE: CmpMode = 0x01; break;
16662 case ISD::SETGT: CmpMode = 0x02; break;
16664 case ISD::SETGE: CmpMode = 0x03; break;
16665 case ISD::SETEQ: CmpMode = 0x04; break;
16666 case ISD::SETNE: CmpMode = 0x05; break;
16669 // Are we comparing unsigned or signed integers?
16670 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
16671 ? X86ISD::VPCOMU : X86ISD::VPCOM;
16673 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16674 DAG.getConstant(CmpMode, dl, MVT::i8));
16677 // We are handling one of the integer comparisons here. Since SSE only has
16678 // GT and EQ comparisons for integer, swapping operands and multiple
16679 // operations may be required for some comparisons.
16681 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
16682 bool Subus = false;
16684 switch (SetCCOpcode) {
16685 default: llvm_unreachable("Unexpected SETCC condition");
16686 case ISD::SETNE: Invert = true;
16687 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
16688 case ISD::SETLT: Swap = true;
16689 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
16690 case ISD::SETGE: Swap = true;
16691 case ISD::SETLE: Opc = X86ISD::PCMPGT;
16692 Invert = true; break;
16693 case ISD::SETULT: Swap = true;
16694 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
16695 FlipSigns = true; break;
16696 case ISD::SETUGE: Swap = true;
16697 case ISD::SETULE: Opc = X86ISD::PCMPGT;
16698 FlipSigns = true; Invert = true; break;
16701 // Special case: Use min/max operations for SETULE/SETUGE
16702 MVT VET = VT.getVectorElementType();
16704 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
16705 || (Subtarget.hasSSE2() && (VET == MVT::i8));
16708 switch (SetCCOpcode) {
16710 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
16711 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
16714 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
16717 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
16718 if (!MinMax && hasSubus) {
16719 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
16721 // t = psubus Op0, Op1
16722 // pcmpeq t, <0..0>
16723 switch (SetCCOpcode) {
16725 case ISD::SETULT: {
16726 // If the comparison is against a constant we can turn this into a
16727 // setule. With psubus, setule does not require a swap. This is
16728 // beneficial because the constant in the register is no longer
16729 // clobbered as the destination, so it can be hoisted out of a loop.
16730 // Only do this pre-AVX since vpcmp* is no longer destructive.
16731 if (Subtarget.hasAVX())
16733 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
16735 Subus = true; Invert = false; Swap = false;
16739 // Psubus is better than flip-sign because it requires no inversion.
16740 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
16741 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
16745 Opc = X86ISD::SUBUS;
16751 std::swap(Op0, Op1);
16753 // Check that the operation in question is available (most are plain SSE2,
16754 // but PCMPGTQ and PCMPEQQ have different requirements).
16755 if (VT == MVT::v2i64) {
16756 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
16757 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
16759 // First cast everything to the right type.
16760 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
16761 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
16763 // Since SSE has no unsigned integer comparisons, we need to flip the sign
16764 // bits of the inputs before performing those operations. The lower
16765 // compare is always unsigned.
16768 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
16770 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
16771 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
16772 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
16774 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
16775 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
16777 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
16778 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
16779 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
16781 // Create masks for only the low parts/high parts of the 64 bit integers.
16782 static const int MaskHi[] = { 1, 1, 3, 3 };
16783 static const int MaskLo[] = { 0, 0, 2, 2 };
16784 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
16785 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
16786 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
16788 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
16789 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
16792 Result = DAG.getNOT(dl, Result, MVT::v4i32);
16794 return DAG.getBitcast(VT, Result);
16797 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
16798 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
16799 // pcmpeqd + pshufd + pand.
16800 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
16802 // First cast everything to the right type.
16803 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
16804 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
16807 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
16809 // Make sure the lower and upper halves are both all-ones.
16810 static const int Mask[] = { 1, 0, 3, 2 };
16811 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
16812 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
16815 Result = DAG.getNOT(dl, Result, MVT::v4i32);
16817 return DAG.getBitcast(VT, Result);
16821 // Since SSE has no unsigned integer comparisons, we need to flip the sign
16822 // bits of the inputs before performing those operations.
16824 MVT EltVT = VT.getVectorElementType();
16825 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
16827 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
16828 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
16831 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
16833 // If the logical-not of the result is required, perform that now.
16835 Result = DAG.getNOT(dl, Result, VT);
16838 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
16841 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
16842 getZeroVector(VT, Subtarget, DAG, dl));
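// The min/max special case finishes the compare as
//   x <=u y  ==>  pcmpeq(x, umin(x, y))
// and the PSUBUS special case as
//   x <=u y  ==>  pcmpeq(psubus(x, y), 0)
// since the unsigned saturating subtract is zero exactly when x <= y.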
16847 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
16849 MVT VT = Op.getSimpleValueType();
16851 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
16853 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
16854 && "SetCC type must be 8-bit or 1-bit integer");
16855 SDValue Op0 = Op.getOperand(0);
16856 SDValue Op1 = Op.getOperand(1);
16858 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16860 // Optimize to BT if possible.
16861 // Lower (X & (1 << N)) == 0 to BT(X, N).
16862 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
16863 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
16864 // Lower (trunc (X >> N) to i1) to BT(X, N).
16865 if (Op0.hasOneUse() && isNullConstant(Op1) &&
16866 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16867 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
16869 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
16874 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of these.
16876 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
16877 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16879 // If the input is a setcc, then reuse the input setcc or use a new one with
16880 // the inverted condition.
16881 if (Op0.getOpcode() == X86ISD::SETCC) {
16882 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
16883 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
16887 CCode = X86::GetOppositeBranchCondition(CCode);
16888 SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
16890 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
16894 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16895 if (isOneConstant(Op1)) {
16896 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
16897 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
16899 if (!isNullConstant(Op1)) {
16900 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
16901 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
16905 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
16906 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
16907 if (X86CC == X86::COND_INVALID)
16910 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
16911 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
16912 SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
16914 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
16918 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
16919 SDValue LHS = Op.getOperand(0);
16920 SDValue RHS = Op.getOperand(1);
16921 SDValue Carry = Op.getOperand(2);
16922 SDValue Cond = Op.getOperand(3);
16925 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
16926 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
16928 assert(Carry.getOpcode() != ISD::CARRY_FALSE);
16929 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16930 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
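// The SBB node's second result is EFLAGS, computed with the incoming carry
// folded into the subtraction; the SETCC below reads those flags.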
16931 SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
16932 if (Op.getSimpleValueType() == MVT::i1)
16933 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
16937 /// Return true if opcode is a X86 logical comparison.
16938 static bool isX86LogicalCmp(SDValue Op) {
16939 unsigned Opc = Op.getOpcode();
16940 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
16941 Opc == X86ISD::SAHF)
16943 if (Op.getResNo() == 1 &&
16944 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
16945 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
16946 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
16947 Opc == X86ISD::XOR || Opc == X86ISD::AND))
16950 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
16956 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
16957 if (V.getOpcode() != ISD::TRUNCATE)
16960 SDValue VOp0 = V.getOperand(0);
16961 unsigned InBits = VOp0.getValueSizeInBits();
16962 unsigned Bits = V.getValueSizeInBits();
16963 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
16966 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
16967 bool AddTest = true;
16968 SDValue Cond = Op.getOperand(0);
16969 SDValue Op1 = Op.getOperand(1);
16970 SDValue Op2 = Op.getOperand(2);
16972 MVT VT = Op1.getSimpleValueType();
16975 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
16976 // are available or VBLENDV if AVX is available.
16977 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
16978 if (Cond.getOpcode() == ISD::SETCC &&
16979 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
16980 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
16981 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
16982 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
16983 int SSECC = translateX86FSETCC(
16984 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
16987 if (Subtarget.hasAVX512()) {
16988 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
16989 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
16990 return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
16991 DL, VT, Cmp, Op1, Op2);
16994 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
16995 DAG.getConstant(SSECC, DL, MVT::i8));
16997 // If we have AVX, we can use a variable vector select (VBLENDV) instead
16998 // of 3 logic instructions for size savings and potentially speed.
16999 // Unfortunately, there is no scalar form of VBLENDV.
17001 // If either operand is a constant, don't try this. We can expect to
17002 // optimize away at least one of the logic instructions later in that
17003 // case, so that sequence would be faster than a variable blend.
17005 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17006 // uses XMM0 as the selection register. That may need just as many
17007 // instructions as the AND/ANDN/OR sequence due to register moves, so don't bother.
17010 if (Subtarget.hasAVX() &&
17011 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17013 // Convert to vectors, do a VSELECT, and convert back to scalar.
17014 // All of the conversions should be optimized away.
17016 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17017 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17018 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17019 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17021 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
17022 VCmp = DAG.getBitcast(VCmpVT, VCmp);
17024 SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
17026 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17027 VSel, DAG.getIntPtrConstant(0, DL));
17029 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
17030 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
17031 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
17035 // AVX512 fallback is to lower selects of scalar floats to masked moves.
17036 if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
17037 Subtarget.hasAVX512())
17038 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
17040 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
17042 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
17043 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
17044 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
17045 Op1Scalar = Op1.getOperand(0);
17047 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
17048 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
17049 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
17050 Op2Scalar = Op2.getOperand(0);
17051 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
17052 SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
17053 Op1Scalar.getValueType(),
17054 Cond, Op1Scalar, Op2Scalar);
17055 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
17056 return DAG.getBitcast(VT, newSelect);
17057 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
17058 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
17059 DAG.getIntPtrConstant(0, DL));
17063 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
17064 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
17065 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17066 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
17067 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17068 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
17069 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
17071 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
17074 if (Cond.getOpcode() == ISD::SETCC) {
17075 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
17077 // If the condition was updated, it's possible that the operands of the
17078 // select were also updated (for example, EmitTest has a RAUW). Refresh
17079 // the local references to the select operands in case they got stale.
17080 Op1 = Op.getOperand(1);
17081 Op2 = Op.getOperand(2);
17085 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
17086 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
17087 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
17088 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
17089 if (Cond.getOpcode() == X86ISD::SETCC &&
17090 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
17091 isNullConstant(Cond.getOperand(1).getOperand(1))) {
17092 SDValue Cmp = Cond.getOperand(1);
17094 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17096 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17097 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
17098 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17100 SDValue CmpOp0 = Cmp.getOperand(0);
17101 // Apply further optimizations for special cases
17102 // (select (x != 0), -1, 0) -> neg & sbb
17103 // (select (x == 0), 0, -1) -> neg & sbb
17104 if (isNullConstant(Y) &&
17105 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
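// SUB 0, x sets CF exactly when x is nonzero, so SETCC_CARRY with COND_B
// materializes -1 for x != 0 and 0 for x == 0 without a branch.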
17106 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17107 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
17108 DAG.getConstant(0, DL,
17109 CmpOp0.getValueType()),
17111 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17112 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17113 SDValue(Neg.getNode(), 1));
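// For the general case, compare x against 1: CF is set only when x == 0
// (unsigned borrow), and SETCC_CARRY again materializes 0 or -1 from it.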
17117 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17118 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17119 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17121 SDValue Res = // Res = 0 or -1.
17122 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17123 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17125 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17126 Res = DAG.getNOT(DL, Res, Res.getValueType());
17128 if (!isNullConstant(Op2))
17129 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17134 // Look past (and (setcc_carry (cmp ...)), 1).
17135 if (Cond.getOpcode() == ISD::AND &&
17136 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17137 isOneConstant(Cond.getOperand(1)))
17138 Cond = Cond.getOperand(0);
17140 // If the condition flag is set by an X86ISD::CMP, then use it as the condition
17141 // setting operand in place of the X86ISD::SETCC.
17142 unsigned CondOpcode = Cond.getOpcode();
17143 if (CondOpcode == X86ISD::SETCC ||
17144 CondOpcode == X86ISD::SETCC_CARRY) {
17145 CC = Cond.getOperand(0);
17147 SDValue Cmp = Cond.getOperand(1);
17148 unsigned Opc = Cmp.getOpcode();
17149 MVT VT = Op.getSimpleValueType();
17151 bool IllegalFPCMov = false;
17152 if (VT.isFloatingPoint() && !VT.isVector() &&
17153 !isScalarFPTypeInSSEReg(VT)) // FPStack?
17154 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17156 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17157 Opc == X86ISD::BT) { // FIXME
17161 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17162 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17163 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17164 Cond.getOperand(0).getValueType() != MVT::i8)) {
17165 SDValue LHS = Cond.getOperand(0);
17166 SDValue RHS = Cond.getOperand(1);
17167 unsigned X86Opcode;
17170 switch (CondOpcode) {
17171 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17172 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17173 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17174 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17175 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17176 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17177 default: llvm_unreachable("unexpected overflowing operator");
17179 if (CondOpcode == ISD::UMULO)
17180 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17183 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
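// The trailing MVT::i32 result is EFLAGS. UMUL carries an extra result, which
// is why its flag result is taken as value #2 below.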
17185 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17187 if (CondOpcode == ISD::UMULO)
17188 Cond = X86Op.getValue(2);
17190 Cond = X86Op.getValue(1);
17192 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17197 // Look past the truncate if the high bits are known zero.
17198 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17199 Cond = Cond.getOperand(0);
17201 // We know the result of the AND is compared against zero. Try to match it to BT.
17203 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17204 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17205 CC = NewSetCC.getOperand(0);
17206 Cond = NewSetCC.getOperand(1);
17213 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17214 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17217 // a < b ? -1 : 0 -> RES = ~setcc_carry
17218 // a < b ? 0 : -1 -> RES = setcc_carry
17219 // a >= b ? -1 : 0 -> RES = setcc_carry
17220 // a >= b ? 0 : -1 -> RES = ~setcc_carry
17221 if (Cond.getOpcode() == X86ISD::SUB) {
17222 Cond = ConvertCmpIfNecessary(Cond, DAG);
17223 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17225 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17226 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17227 (isNullConstant(Op1) || isNullConstant(Op2))) {
17228 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17229 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17231 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17232 return DAG.getNOT(DL, Res, Res.getValueType());
17237 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
17238 // widen the cmov and push the truncate through. This avoids introducing a new
17239 // branch during isel and doesn't add any extensions.
17240 if (Op.getValueType() == MVT::i8 &&
17241 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17242 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17243 if (T1.getValueType() == T2.getValueType() &&
17244 // Blacklist CopyFromReg to avoid partial register stalls.
17245 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17246 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17247 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17248 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17252 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17253 // condition is true.
17254 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17255 SDValue Ops[] = { Op2, Op1, CC, Cond };
17256 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17259 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17260 const X86Subtarget &Subtarget,
17261 SelectionDAG &DAG) {
17262 MVT VT = Op->getSimpleValueType(0);
17263 SDValue In = Op->getOperand(0);
17264 MVT InVT = In.getSimpleValueType();
17265 MVT VTElt = VT.getVectorElementType();
17266 MVT InVTElt = InVT.getVectorElementType();
17270 if ((InVTElt == MVT::i1) &&
17271 (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
17272 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
17274 ((Subtarget.hasBWI() && VT.is512BitVector() &&
17275 VTElt.getSizeInBits() <= 16)) ||
17277 ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
17278 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
17280 ((Subtarget.hasDQI() && VT.is512BitVector() &&
17281 VTElt.getSizeInBits() >= 32))))
17282 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17284 unsigned NumElts = VT.getVectorNumElements();
17286 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
17289 if (VT.is512BitVector() && InVTElt != MVT::i1) {
17290 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
17291 return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
17292 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17295 assert (InVTElt == MVT::i1 && "Unexpected vector type");
17296 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
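// Build the sign-extended value in a 512-bit vector with the same number of
// elements, then truncate it back down to VT below when VT is narrower.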
17298 if (Subtarget.hasDQI()) {
17299 V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
17300 assert(!VT.is512BitVector() && "Unexpected vector type");
17302 SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
17303 SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
17304 V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
17305 if (VT.is512BitVector())
17309 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
17312 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
17313 // For sign extend this needs to handle all vector sizes and SSE4.1 and
17314 // non-SSE4.1 targets. For zero extend this should only handle inputs of
17315 // MVT::v64i8 when BWI is not supported, but AVX512 is.
17316 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
17317 const X86Subtarget &Subtarget,
17318 SelectionDAG &DAG) {
17319 SDValue In = Op->getOperand(0);
17320 MVT VT = Op->getSimpleValueType(0);
17321 MVT InVT = In.getSimpleValueType();
17322 assert(VT.getSizeInBits() == InVT.getSizeInBits());
17324 MVT SVT = VT.getVectorElementType();
17325 MVT InSVT = InVT.getVectorElementType();
17326 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
17328 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
17330 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
17332 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
17333 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
17334 !(VT.is512BitVector() && Subtarget.hasAVX512()))
17339 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17340 // For 512-bit vectors, we need 128-bits or 256-bits.
17341 if (VT.getSizeInBits() > 128) {
17342 // Input needs to be at least the same number of elements as output, and
17343 // at least 128-bits.
17344 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
17345 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
17348 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
17349 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
17351 // SSE41 targets can use the pmovsx* instructions directly.
17352 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
17353 X86ISD::VSEXT : X86ISD::VZEXT;
17354 if (Subtarget.hasSSE41())
17355 return DAG.getNode(ExtOpc, dl, VT, In);
17357 // We should only get here for sign extend.
17358 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
17359 "Unexpected opcode!");
17361 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
17365 // As SRAI is only available on i16/i32 types, we expand only up to i32
17366 // and handle i64 separately.
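// e.g. for v16i8 -> v8i16, unpcklbw against undef puts each low input byte in
// the high half of an i16 lane; the arithmetic shift right by 8 below then
// fills the low half with sign bits.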
17367 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
17368 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
17369 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
17370 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
17371 Curr = DAG.getBitcast(CurrVT, Curr);
17374 SDValue SignExt = Curr;
17375 if (CurrVT != InVT) {
17376 unsigned SignExtShift =
17377 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
17378 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17379 DAG.getConstant(SignExtShift, dl, MVT::i8));
17385 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
17386 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17387 DAG.getConstant(31, dl, MVT::i8));
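// There is no packed 64-bit arithmetic shift before AVX-512, so build each
// i64 by interleaving a sign-extended dword with a dword of its sign bits
// ({0, 4, 1, 5}).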
17388 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
17389 return DAG.getBitcast(VT, Ext);
17395 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17396 SelectionDAG &DAG) {
17397 MVT VT = Op->getSimpleValueType(0);
17398 SDValue In = Op->getOperand(0);
17399 MVT InVT = In.getSimpleValueType();
17402 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
17403 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
17405 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
17406 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
17407 (VT != MVT::v16i16 || InVT != MVT::v16i8))
17410 if (Subtarget.hasInt256())
17411 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17413 // Optimize vectors in AVX mode:
17414 // sign extend v8i16 to v8i32 and v4i32 to v4i64.
17417 // Divide the input vector into two parts;
17418 // for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }.
17419 // Use a vpmovsx instruction to extend each half (v4i32 -> v2i64; v8i16 -> v4i32),
17420 // then concat the vectors back to the original VT.
17422 unsigned NumElems = InVT.getVectorNumElements();
17423 SDValue Undef = DAG.getUNDEF(InVT);
17425 SmallVector<int,8> ShufMask1(NumElems, -1);
17426 for (unsigned i = 0; i != NumElems/2; ++i)
17429 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
17431 SmallVector<int,8> ShufMask2(NumElems, -1);
17432 for (unsigned i = 0; i != NumElems/2; ++i)
17433 ShufMask2[i] = i + NumElems/2;
17435 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
17437 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
17438 VT.getVectorNumElements() / 2);
17440 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
17441 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
17443 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
17446 // Lower a truncating store. We need a special lowering for vXi1 vectors.
17447 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
17448 SelectionDAG &DAG) {
17449 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
17451 EVT MemVT = St->getMemoryVT();
17452 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
17453 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
17454 "Expected truncstore of i1 vector");
17456 SDValue Op = St->getValue();
17457 MVT OpVT = Op.getValueType().getSimpleVT();
17458 unsigned NumElts = OpVT.getVectorNumElements();
17459 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17461 // Truncate and store - everything is legal
17462 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
17463 if (MemVT.getSizeInBits() < 8)
17464 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
17465 DAG.getUNDEF(MVT::v8i1), Op,
17466 DAG.getIntPtrConstant(0, dl));
17467 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17468 St->getMemOperand());
17471 // Only a subset of the features is available; assume we have just AVX-512F.
17472 if (NumElts <= 8) {
17474 // Extend to an 8-element vector.
17475 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
17476 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
17477 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
17479 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
17480 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17481 St->getMemOperand());
17484 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
17485 // Divide the vector into 2 parts and store each part separately
17486 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17487 DAG.getIntPtrConstant(0, dl));
17488 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
17489 SDValue BasePtr = St->getBasePtr();
17490 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
17491 St->getMemOperand());
17492 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17493 DAG.getIntPtrConstant(16, dl));
17494 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
17496 SDValue BasePtrHi =
17497 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
17498 DAG.getConstant(2, dl, BasePtr.getValueType()));
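// The upper 16 mask bits are stored 2 bytes (16 bits) past the base pointer.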
17500 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
17501 BasePtrHi, St->getMemOperand());
17502 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
17505 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
17506 const X86Subtarget &Subtarget,
17507 SelectionDAG &DAG) {
17509 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
17511 EVT MemVT = Ld->getMemoryVT();
17512 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
17513 "Expected i1 vector load");
17514 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
17515 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17516 MVT VT = Op.getValueType().getSimpleVT();
17517 unsigned NumElts = VT.getVectorNumElements();
17519 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17521 // Load and extend - everything is legal
17523 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
17525 Ld->getMemOperand());
17526 // Replace chain users with the new chain.
17527 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17528 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17529 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
17530 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
17532 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
17533 DAG.getIntPtrConstant(0, dl));
17535 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
17537 Ld->getMemOperand());
17538 // Replace chain users with the new chain.
17539 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17540 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17542 // Finally, do a normal sign-extend to the desired register.
17543 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
17546 if (NumElts <= 8) {
17547 // Only a subset of the features is available; assume we have just AVX-512F.
17548 unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
17549 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
17550 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
17552 Ld->getMemOperand());
17553 // Replace chain users with the new chain.
17554 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17555 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17557 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
17558 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
17561 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
17563 // For 4- and 2-element results, extend to 8 elements and extract the subvector.
17565 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
17566 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
17567 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
17568 DAG.getIntPtrConstant(0, dl));
17571 assert(VT == MVT::v32i8 && "Unexpected extload type");
17573 SmallVector<SDValue, 2> Chains;
17575 SDValue BasePtr = Ld->getBasePtr();
17576 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
17578 Ld->getMemOperand());
17579 Chains.push_back(LoadLo.getValue(1));
17581 SDValue BasePtrHi =
17582 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
17583 DAG.getConstant(2, dl, BasePtr.getValueType()));
17585 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
17587 Ld->getMemOperand());
17588 Chains.push_back(LoadHi.getValue(1));
17589 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
17590 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
17592 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
17593 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
17594 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
17597 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
17598 // may emit an illegal shuffle but the expansion is still better than scalar
17599 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
17600 // we'll emit a shuffle and an arithmetic shift.
17601 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
17602 // TODO: It is possible to support ZExt by zeroing the undef values during
17603 // the shuffle phase or after the shuffle.
17604 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
17605 SelectionDAG &DAG) {
17606 MVT RegVT = Op.getSimpleValueType();
17607 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
17608 assert(RegVT.isInteger() &&
17609 "We only custom lower integer vector sext loads.");
17611 // Nothing useful we can do without SSE2 shuffles.
17612 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
17614 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
17616 EVT MemVT = Ld->getMemoryVT();
17617 if (MemVT.getScalarType() == MVT::i1)
17618 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
17620 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17621 unsigned RegSz = RegVT.getSizeInBits();
17623 ISD::LoadExtType Ext = Ld->getExtensionType();
17625 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
17626 && "Only anyext and sext are currently implemented.");
17627 assert(MemVT != RegVT && "Cannot extend to the same type");
17628 assert(MemVT.isVector() && "Must load a vector from memory");
17630 unsigned NumElems = RegVT.getVectorNumElements();
17631 unsigned MemSz = MemVT.getSizeInBits();
17632 assert(RegSz > MemSz && "Register size must be greater than the mem size");
17634 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
17635 // The only way in which we have a legal 256-bit vector result but not the
17636 // integer 256-bit operations needed to directly lower a sextload is if we
17637 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
17638 // a 128-bit vector and a normal sign_extend to 256-bits that should get
17639 // correctly legalized. We do this late to allow the canonical form of
17640 // sextload to persist throughout the rest of the DAG combiner -- it wants
17641 // to fold together any extensions it can, and so will fuse a sign_extend
17642 // of an sextload into a sextload targeting a wider value.
17644 if (MemSz == 128) {
17645 // Just switch this to a normal load.
17646 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
17647 "it must be a legal 128-bit vector "
17649 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
17650 Ld->getPointerInfo(), Ld->getAlignment(),
17651 Ld->getMemOperand()->getFlags());
17653 assert(MemSz < 128 &&
17654 "Can't extend a type wider than 128 bits to a 256 bit vector!");
17655 // Do an sext load to a 128-bit vector type. We want to use the same
17656 // number of elements, but elements half as wide. This will end up being
17657 // recursively lowered by this routine, but will succeed as we definitely
17658 // have all the necessary features if we're using AVX1.
17660 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
17661 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
17663 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
17664 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
17665 Ld->getMemOperand()->getFlags());
17668 // Replace chain users with the new chain.
17669 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17670 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17672 // Finally, do a normal sign-extend to the desired register.
17673 return DAG.getSExtOrTrunc(Load, dl, RegVT);
17676 // All sizes must be a power of two.
17677 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
17678 "Non-power-of-two elements are not custom lowered!");
17680 // Attempt to load the original value using scalar loads.
17681 // Find the largest scalar type that divides the total loaded size.
17682 MVT SclrLoadTy = MVT::i8;
17683 for (MVT Tp : MVT::integer_valuetypes()) {
17684 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
17689 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to F64.
17690 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
17692 SclrLoadTy = MVT::f64;
17694 // Calculate the number of scalar loads that we need to perform
17695 // in order to load our vector from memory.
17696 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
17698 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
17699 "Can only lower sext loads with a single scalar load!");
17701 unsigned loadRegZize = RegSz;
17702 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
17705 // Represent our vector as a sequence of elements which are the
17706 // largest scalar that we can load.
17707 EVT LoadUnitVecVT = EVT::getVectorVT(
17708 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
17710 // Represent the data using the same element type that is stored in
17711 // memory. In practice, we "widen" MemVT.
17713 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
17714 loadRegZize / MemVT.getScalarSizeInBits());
17716 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
17717 "Invalid vector type");
17719 // We can't shuffle using an illegal type.
17720 assert(TLI.isTypeLegal(WideVecVT) &&
17721 "We only lower types that form legal widened vector types");
17723 SmallVector<SDValue, 8> Chains;
17724 SDValue Ptr = Ld->getBasePtr();
17725 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
17726 TLI.getPointerTy(DAG.getDataLayout()));
17727 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
17729 for (unsigned i = 0; i < NumLoads; ++i) {
17730 // Perform a single load.
17731 SDValue ScalarLoad =
17732 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
17733 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
17734 Chains.push_back(ScalarLoad.getValue(1));
17735 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
17736 // another round of DAGCombining.
17738 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
17740 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
17741 ScalarLoad, DAG.getIntPtrConstant(i, dl));
17743 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
17746 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
17748 // Bitcast the loaded value to a vector of the original element type, in
17749 // the size of the target vector type.
17750 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
17751 unsigned SizeRatio = RegSz / MemSz;
17753 if (Ext == ISD::SEXTLOAD) {
17754 // If we have SSE4.1, we can directly emit a VSEXT node.
17755 if (Subtarget.hasSSE41()) {
17756 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
17757 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
17761 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest lanes.
17763 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
17764 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
17766 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
17767 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
17771 // Redistribute the loaded elements into the different locations.
17772 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
17773 for (unsigned i = 0; i != NumElems; ++i)
17774 ShuffleVec[i * SizeRatio] = i;
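// e.g. for a v4i8 -> v4i32 extload, SizeRatio is 4 and the loaded bytes land
// at indices 0, 4, 8 and 12 of the wide vector; the remaining lanes stay undef.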
17776 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
17777 DAG.getUNDEF(WideVecVT), ShuffleVec);
17779 // Bitcast to the requested type.
17780 Shuff = DAG.getBitcast(RegVT, Shuff);
17781 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
17785 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
17786 /// each of which has no other use apart from the AND / OR.
17787 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
17788 Opc = Op.getOpcode();
17789 if (Opc != ISD::OR && Opc != ISD::AND)
17791 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
17792 Op.getOperand(0).hasOneUse() &&
17793 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
17794 Op.getOperand(1).hasOneUse());
17797 /// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and the
17798 /// SETCC node has a single use.
17799 static bool isXor1OfSetCC(SDValue Op) {
17800 if (Op.getOpcode() != ISD::XOR)
17802 if (isOneConstant(Op.getOperand(1)))
17803 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
17804 Op.getOperand(0).hasOneUse();
17808 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
17809 bool addTest = true;
17810 SDValue Chain = Op.getOperand(0);
17811 SDValue Cond = Op.getOperand(1);
17812 SDValue Dest = Op.getOperand(2);
17815 bool Inverted = false;
17817 if (Cond.getOpcode() == ISD::SETCC) {
17818 // Check for setcc([su]{add,sub,mul}o == 0).
17819 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
17820 isNullConstant(Cond.getOperand(1)) &&
17821 Cond.getOperand(0).getResNo() == 1 &&
17822 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
17823 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
17824 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
17825 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
17826 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
17827 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
17829 Cond = Cond.getOperand(0);
17831 if (SDValue NewCond = LowerSETCC(Cond, DAG))
17836 // FIXME: LowerXALUO doesn't handle these!!
17837 else if (Cond.getOpcode() == X86ISD::ADD ||
17838 Cond.getOpcode() == X86ISD::SUB ||
17839 Cond.getOpcode() == X86ISD::SMUL ||
17840 Cond.getOpcode() == X86ISD::UMUL)
17841 Cond = LowerXALUO(Cond, DAG);
17844 // Look past (and (setcc_carry (cmp ...)), 1).
17845 if (Cond.getOpcode() == ISD::AND &&
17846 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17847 isOneConstant(Cond.getOperand(1)))
17848 Cond = Cond.getOperand(0);
17850 // If the condition flag is set by an X86ISD::CMP, then use it as the condition
17851 // setting operand in place of the X86ISD::SETCC.
17852 unsigned CondOpcode = Cond.getOpcode();
17853 if (CondOpcode == X86ISD::SETCC ||
17854 CondOpcode == X86ISD::SETCC_CARRY) {
17855 CC = Cond.getOperand(0);
17857 SDValue Cmp = Cond.getOperand(1);
17858 unsigned Opc = Cmp.getOpcode();
17859 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
17860 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
17864 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
17868 // These can only come from an arithmetic instruction with overflow,
17869 // e.g. SADDO, UADDO.
17870 Cond = Cond.getOperand(1);
17876 CondOpcode = Cond.getOpcode();
17877 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17878 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17879 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17880 Cond.getOperand(0).getValueType() != MVT::i8)) {
17881 SDValue LHS = Cond.getOperand(0);
17882 SDValue RHS = Cond.getOperand(1);
17883 unsigned X86Opcode;
17886 // Keep this in sync with LowerXALUO, otherwise we might create redundant
17887 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
17889 switch (CondOpcode) {
17890 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17892 if (isOneConstant(RHS)) {
17893 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
17896 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17897 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17899 if (isOneConstant(RHS)) {
17900 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
17903 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17904 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17905 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17906 default: llvm_unreachable("unexpected overflowing operator");
17909 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
17910 if (CondOpcode == ISD::UMULO)
17911 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17914 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17916 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
17918 if (CondOpcode == ISD::UMULO)
17919 Cond = X86Op.getValue(2);
17921 Cond = X86Op.getValue(1);
17923 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
17927 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
17928 SDValue Cmp = Cond.getOperand(0).getOperand(1);
17929 if (CondOpc == ISD::OR) {
17930 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
17931 // two branches instead of an explicit OR instruction with a separate test.
17933 if (Cmp == Cond.getOperand(1).getOperand(1) &&
17934 isX86LogicalCmp(Cmp)) {
17935 CC = Cond.getOperand(0).getOperand(0);
17936 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17937 Chain, Dest, CC, Cmp);
17938 CC = Cond.getOperand(1).getOperand(0);
17942 } else { // ISD::AND
17943 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
17944 // two branches instead of an explicit AND instruction with a
17945 // separate test. However, we only do this if this block doesn't
17946 // have a fall-through edge, because this requires an explicit
17947 // jmp when the condition is false.
17948 if (Cmp == Cond.getOperand(1).getOperand(1) &&
17949 isX86LogicalCmp(Cmp) &&
17950 Op.getNode()->hasOneUse()) {
17951 X86::CondCode CCode =
17952 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
17953 CCode = X86::GetOppositeBranchCondition(CCode);
17954 CC = DAG.getConstant(CCode, dl, MVT::i8);
17955 SDNode *User = *Op.getNode()->use_begin();
17956 // Look for an unconditional branch following this conditional branch.
17957 // We need this because we need to reverse the successors in order
17958 // to implement FCMP_OEQ.
17959 if (User->getOpcode() == ISD::BR) {
17960 SDValue FalseBB = User->getOperand(1);
17962 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
17963 assert(NewBR == User);
17967 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17968 Chain, Dest, CC, Cmp);
17969 X86::CondCode CCode =
17970 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
17971 CCode = X86::GetOppositeBranchCondition(CCode);
17972 CC = DAG.getConstant(CCode, dl, MVT::i8);
17978 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
17979 // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
17980 // It should be transformed by the DAG combiner except when the condition
17981 // is set by an arithmetic-with-overflow node.
17982 X86::CondCode CCode =
17983 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
17984 CCode = X86::GetOppositeBranchCondition(CCode);
17985 CC = DAG.getConstant(CCode, dl, MVT::i8);
17986 Cond = Cond.getOperand(0).getOperand(1);
17988 } else if (Cond.getOpcode() == ISD::SETCC &&
17989 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
17990 // For FCMP_OEQ, we can emit
17991 // two branches instead of an explicit AND instruction with a
17992 // separate test. However, we only do this if this block doesn't
17993 // have a fall-through edge, because this requires an explicit
17994 // jmp when the condition is false.
17995 if (Op.getNode()->hasOneUse()) {
17996 SDNode *User = *Op.getNode()->use_begin();
17997 // Look for an unconditional branch following this conditional branch.
17998 // We need this because we need to reverse the successors in order
17999 // to implement FCMP_OEQ.
18000 if (User->getOpcode() == ISD::BR) {
18001 SDValue FalseBB = User->getOperand(1);
18003 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18004 assert(NewBR == User);
18008 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18009 Cond.getOperand(0), Cond.getOperand(1));
18010 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18011 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18012 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18013 Chain, Dest, CC, Cmp);
18014 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
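// OEQ is also false when the operands are unordered, so a second branch on
// COND_P is needed (the compare sets PF when the operands are unordered).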
18019 } else if (Cond.getOpcode() == ISD::SETCC &&
18020 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
18021 // For FCMP_UNE, we can emit
18022 // two branches instead of an explicit OR instruction with a
18023 // separate test. However, we only do this if this block doesn't
18024 // have a fall-through edge, because this requires an explicit
18025 // jmp when the condition is false.
18026 if (Op.getNode()->hasOneUse()) {
18027 SDNode *User = *Op.getNode()->use_begin();
18028 // Look for an unconditional branch following this conditional branch.
18029 // We need this because we need to reverse the successors in order
18030 // to implement FCMP_UNE.
18031 if (User->getOpcode() == ISD::BR) {
18032 SDValue FalseBB = User->getOperand(1);
18034 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18035 assert(NewBR == User);
18038 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18039 Cond.getOperand(0), Cond.getOperand(1));
18040 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18041 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18042 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18043 Chain, Dest, CC, Cmp);
18044 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
18054 // Look past the truncate if the high bits are known zero.
18055 if (isTruncWithZeroHighBitsInput(Cond, DAG))
18056 Cond = Cond.getOperand(0);
18058 // We know the result is compared against zero. Try to match it to BT.
18059 if (Cond.hasOneUse()) {
18060 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
18061 CC = NewSetCC.getOperand(0);
18062 Cond = NewSetCC.getOperand(1);
18069 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
18070 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18071 Cond = EmitTest(Cond, X86Cond, dl, DAG);
18073 Cond = ConvertCmpIfNecessary(Cond, DAG);
18074 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18075 Chain, Dest, CC, Cond);
18078 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
18079 // Calls to _alloca are needed to probe the stack when allocating more than 4k
18080 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
18081 // that the guard pages used by the OS virtual memory manager are allocated in
18082 // correct sequence.
18084 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
18085 SelectionDAG &DAG) const {
18086 MachineFunction &MF = DAG.getMachineFunction();
18087 bool SplitStack = MF.shouldSplitStack();
18088 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
18093 SDNode *Node = Op.getNode();
18094 SDValue Chain = Op.getOperand(0);
18095 SDValue Size = Op.getOperand(1);
18096 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18097 EVT VT = Node->getValueType(0);
18099 // Chain the dynamic stack allocation so that it doesn't modify the stack
18100 // pointer when other instructions are using the stack.
18101 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
18103 bool Is64Bit = Subtarget.is64Bit();
18104 MVT SPTy = getPointerTy(DAG.getDataLayout());
18108 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18109 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18110 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18111 " not tell us which reg is the stack pointer!");
18113 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18114 Chain = SP.getValue(1);
18115 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18116 unsigned StackAlign = TFI.getStackAlignment();
18117 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
18118 if (Align > StackAlign)
18119 Result = DAG.getNode(ISD::AND, dl, VT, Result,
18120 DAG.getConstant(-(uint64_t)Align, dl, VT));
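// Align is a power of two, so masking with -Align rounds the new stack
// pointer down to the requested alignment.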
18121 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18122 } else if (SplitStack) {
18123 MachineRegisterInfo &MRI = MF.getRegInfo();
18126 // The 64-bit implementation of segmented stacks needs to clobber both r10
18127 // and r11. This makes it impossible to use it along with nested parameters.
18128 const Function *F = MF.getFunction();
18129 for (const auto &A : F->args()) {
18130 if (A.hasNestAttr())
18131 report_fatal_error("Cannot use segmented stacks with functions that "
18132 "have nested arguments.");
18136 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18137 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18138 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18139 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
18140 DAG.getRegister(Vreg, SPTy));
18142 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18143 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18144 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18146 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18147 unsigned SPReg = RegInfo->getStackRegister();
18148 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
18149 Chain = SP.getValue(1);
18152 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18153 DAG.getConstant(-(uint64_t)Align, dl, VT));
18154 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
18160 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18161 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18163 SDValue Ops[2] = {Result, Chain};
18164 return DAG.getMergeValues(Ops, dl);
18167 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18168 MachineFunction &MF = DAG.getMachineFunction();
18169 auto PtrVT = getPointerTy(MF.getDataLayout());
18170 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18172 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18175 if (!Subtarget.is64Bit() ||
18176 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18177 // vastart just stores the address of the VarArgsFrameIndex slot into the
18178 // memory location argument.
18179 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18180 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18181 MachinePointerInfo(SV));
18185 // gp_offset (0 - 6 * 8)
18186 // fp_offset (48 - 48 + 8 * 16)
18187 // overflow_arg_area (points to parameters coming in memory).
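// The 64-bit SysV va_list is laid out as
//   { i32 gp_offset, i32 fp_offset, i8* overflow_arg_area, i8* reg_save_area },
// which matches the four stores emitted below.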
18189 SmallVector<SDValue, 8> MemOps;
18190 SDValue FIN = Op.getOperand(1);
18192 SDValue Store = DAG.getStore(
18193 Op.getOperand(0), DL,
18194 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18195 MachinePointerInfo(SV));
18196 MemOps.push_back(Store);
18199 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18200 Store = DAG.getStore(
18201 Op.getOperand(0), DL,
18202 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18203 MachinePointerInfo(SV, 4));
18204 MemOps.push_back(Store);
18206 // Store ptr to overflow_arg_area
18207 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18208 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18210 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18211 MemOps.push_back(Store);
18213 // Store ptr to reg_save_area.
18214 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18215 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18216 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18217 Store = DAG.getStore(
18218 Op.getOperand(0), DL, RSFIN, FIN,
18219 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18220 MemOps.push_back(Store);
18221 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
18224 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18225 assert(Subtarget.is64Bit() &&
18226 "LowerVAARG only handles 64-bit va_arg!");
18227 assert(Op.getNumOperands() == 4);
18229 MachineFunction &MF = DAG.getMachineFunction();
18230 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18231 // The Win64 ABI uses char* instead of a structure.
18232 return DAG.expandVAArg(Op.getNode());
18234 SDValue Chain = Op.getOperand(0);
18235 SDValue SrcPtr = Op.getOperand(1);
18236 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18237 unsigned Align = Op.getConstantOperandVal(3);
18240 EVT ArgVT = Op.getNode()->getValueType(0);
18241 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18242 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
18245 // Decide which area this value should be read from.
18246 // TODO: Implement the AMD64 ABI in its entirety. This simple
18247 // selection mechanism works only for the basic types.
18248 if (ArgVT == MVT::f80) {
18249 llvm_unreachable("va_arg for f80 not yet implemented");
18250 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18251 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
18252 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18253 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
18255 llvm_unreachable("Unhandled argument type in LowerVAARG");
18258 if (ArgMode == 2) {
18259 // Sanity Check: Make sure using fp_offset makes sense.
18260 assert(!Subtarget.useSoftFloat() &&
18261 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
18262 Subtarget.hasSSE1());
18265 // Insert a VAARG_64 node into the DAG.
18266 // VAARG_64 returns two values: the variable argument address and the chain.
18267 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
18268 DAG.getConstant(ArgMode, dl, MVT::i8),
18269 DAG.getConstant(Align, dl, MVT::i32)};
18270 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
18271 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
18272 VTs, InstOps, MVT::i64,
18273 MachinePointerInfo(SV),
18275 /*Volatile=*/false,
18277 /*WriteMem=*/true);
18278 Chain = VAARG.getValue(1);
18280 // Load the next argument and return it
18281 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
18284 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
18285 SelectionDAG &DAG) {
18286 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
18287 // where a va_list is still an i8*.
18288 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
18289 if (Subtarget.isCallingConvWin64(
18290 DAG.getMachineFunction().getFunction()->getCallingConv()))
18291 // Probably a Win64 va_copy.
18292 return DAG.expandVACopy(Op.getNode());
18294 SDValue Chain = Op.getOperand(0);
18295 SDValue DstPtr = Op.getOperand(1);
18296 SDValue SrcPtr = Op.getOperand(2);
18297 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
18298 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18301 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
18302 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
18304 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
18307 /// Handle vector element shifts where the shift amount is a constant.
18308 /// Takes immediate version of shift as input.
18309 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
18310 SDValue SrcOp, uint64_t ShiftAmt,
18311 SelectionDAG &DAG) {
18312 MVT ElementType = VT.getVectorElementType();
18314 // Fold this packed shift into its first operand if ShiftAmt is 0.
18318 // Check for ShiftAmt >= element width
18319 if (ShiftAmt >= ElementType.getSizeInBits()) {
18320 if (Opc == X86ISD::VSRAI)
18321 ShiftAmt = ElementType.getSizeInBits() - 1;
18323 return DAG.getConstant(0, dl, VT);
18326 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
18327 && "Unknown target vector shift-by-constant node");
18329 // Fold this packed vector shift into a build vector if SrcOp is a
18330 // vector of Constants or UNDEFs, and the SrcOp value type is the same as VT.
18331 if (VT == SrcOp.getSimpleValueType() &&
18332 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
18333 SmallVector<SDValue, 8> Elts;
18334 unsigned NumElts = SrcOp->getNumOperands();
18335 ConstantSDNode *ND;
18338 default: llvm_unreachable("Unknown opcode!");
18339 case X86ISD::VSHLI:
18340 for (unsigned i=0; i!=NumElts; ++i) {
18341 SDValue CurrentOp = SrcOp->getOperand(i);
18342 if (CurrentOp->isUndef()) {
18343 Elts.push_back(CurrentOp);
18346 ND = cast<ConstantSDNode>(CurrentOp);
18347 const APInt &C = ND->getAPIntValue();
18348 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
18351 case X86ISD::VSRLI:
18352 for (unsigned i=0; i!=NumElts; ++i) {
18353 SDValue CurrentOp = SrcOp->getOperand(i);
18354 if (CurrentOp->isUndef()) {
18355 Elts.push_back(CurrentOp);
18358 ND = cast<ConstantSDNode>(CurrentOp);
18359 const APInt &C = ND->getAPIntValue();
18360 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
18363 case X86ISD::VSRAI:
18364 for (unsigned i=0; i!=NumElts; ++i) {
18365 SDValue CurrentOp = SrcOp->getOperand(i);
18366 if (CurrentOp->isUndef()) {
18367 Elts.push_back(CurrentOp);
18370 ND = cast<ConstantSDNode>(CurrentOp);
18371 const APInt &C = ND->getAPIntValue();
18372 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
18377 return DAG.getBuildVector(VT, dl, Elts);
18380 return DAG.getNode(Opc, dl, VT, SrcOp,
18381 DAG.getConstant(ShiftAmt, dl, MVT::i8));
18384 /// Handle vector element shifts where the shift amount may or may not be a
18385 /// constant. Takes immediate version of shift as input.
18386 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
18387 SDValue SrcOp, SDValue ShAmt,
18388 const X86Subtarget &Subtarget,
18389 SelectionDAG &DAG) {
18390 MVT SVT = ShAmt.getSimpleValueType();
18391 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
18393 // Catch shift-by-constant.
18394 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
18395 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
18396 CShAmt->getZExtValue(), DAG);
18398 // Change opcode to non-immediate version
18399 switch (Opc) {
18400 default: llvm_unreachable("Unknown target vector shift node");
18401 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
18402 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
18403 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
18404 }
18406 // Need to build a vector containing shift amount.
18407 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
18408 // +=================+============+=======================================+
18409 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
18410 // +=================+============+=======================================+
18411 // | i64 | Yes, No | Use ShAmt as lowest elt |
18412 // | i32 | Yes | zero-extend in-reg |
18413 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
18414 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
18415 // +=================+============+=======================================+
18417 if (SVT == MVT::i64)
18418 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
18419 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
18420 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
18421 SDValue Op0 = ShAmt.getOperand(0);
18422 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
18423 ShAmt = DAG.getZeroExtendVectorInReg(Op0, SDLoc(Op0), MVT::v2i64);
18424 } else if (Subtarget.hasSSE41() &&
18425 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
18426 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
18427 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
18428 } else {
18429 SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
18430 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
18431 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
18432 }
18434 // The return type has to be a 128-bit type with the same element
18435 // type as the input type.
18436 MVT EltVT = VT.getVectorElementType();
18437 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
18439 ShAmt = DAG.getBitcast(ShVT, ShAmt);
18440 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
18441 }
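// For example, with a plain i32 shift amount and no SSE4.1 the code above
// materializes the count as v4i32 build_vector(ShAmt, 0, undef, undef) and
// bitcasts it to the 128-bit shift type, so only the low 64 bits of the XMM
// count register are defined, which is all VSHL/VSRL/VSRA ever read.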
18443 /// \brief Return Mask with the necessary casting or extending
18444 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
18445 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
18446 const X86Subtarget &Subtarget, SelectionDAG &DAG,
18447 const SDLoc &dl) {
18449 if (isAllOnesConstant(Mask))
18450 return DAG.getTargetConstant(1, dl, MaskVT);
18451 if (X86::isZeroNode(Mask))
18452 return DAG.getTargetConstant(0, dl, MaskVT);
18454 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
18455 // Mask should be extended
18456 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
18457 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
18458 }
18460 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
18461 if (MaskVT == MVT::v64i1) {
18462 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
18463 // In 32-bit mode a bitcast of i64 is illegal; extend/split it.
18464 SDValue Lo, Hi;
18465 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18466 DAG.getConstant(0, dl, MVT::i32));
18467 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18468 DAG.getConstant(1, dl, MVT::i32));
18470 Lo = DAG.getBitcast(MVT::v32i1, Lo);
18471 Hi = DAG.getBitcast(MVT::v32i1, Hi);
18473 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
18474 } else {
18475 // MaskVT requires < 64 bits. Truncate the mask (should succeed in any case)
18476 // and bitcast to the required vector type.
18477 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
18478 return DAG.getBitcast(MaskVT,
18479 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
18480 }
18481 }
18483 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
18484 Mask.getSimpleValueType().getSizeInBits());
18485 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
18486 // are extracted by EXTRACT_SUBVECTOR.
18487 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
18488 DAG.getBitcast(BitcastVT, Mask),
18489 DAG.getIntPtrConstant(0, dl));
18490 }
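// For example, a 128-bit masked intrinsic with an i8 mask and MaskVT == v2i1
// takes the final path above and produces
// (v2i1 (extract_subvector (v8i1 (bitcast i8 %mask)), 0)),
// i.e. only the low two mask bits survive, matching the two vector lanes.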
18493 /// \brief Return (and \p Op, \p Mask) for compare instructions or
18494 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
18495 /// necessary casting or extending for \p Mask when lowering masking intrinsics
18496 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
18497 SDValue PreservedSrc,
18498 const X86Subtarget &Subtarget,
18499 SelectionDAG &DAG) {
18500 MVT VT = Op.getSimpleValueType();
18501 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18502 unsigned OpcodeSelect = ISD::VSELECT;
18503 SDLoc dl(Op);
18505 if (isAllOnesConstant(Mask))
18506 return Op;
18508 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18510 switch (Op.getOpcode()) {
18511 default: break;
18512 case X86ISD::PCMPEQM:
18513 case X86ISD::PCMPGTM:
18514 case X86ISD::CMPM:
18515 case X86ISD::CMPMU:
18516 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
18517 case X86ISD::VFPCLASS:
18518 case X86ISD::VFPCLASSS:
18519 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
18520 case X86ISD::VTRUNC:
18521 case X86ISD::VTRUNCS:
18522 case X86ISD::VTRUNCUS:
18523 case X86ISD::CVTPS2PH:
18524 // We can't use ISD::VSELECT here because it is not always "Legal"
18525 // for the destination type. For example, vpmovqb requires only AVX512,
18526 // while a vselect that operates on byte elements requires AVX512BW.
18527 OpcodeSelect = X86ISD::SELECT;
18528 break;
18529 }
18530 if (PreservedSrc.isUndef())
18531 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
18532 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
18533 }
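// For example, a masked AVX-512 arithmetic intrinsic comes out of here as
// (vselect (v16i1 %mask), (op %a, %b), %passthru), whereas compare nodes such
// as PCMPEQM are simply ANDed with the mask because their result is already a
// mask vector.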
18535 /// \brief Creates an SDNode for a predicated scalar operation.
18536 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
18537 /// The mask is coming as MVT::i8 and it should be truncated
18538 /// to MVT::i1 while lowering masking intrinsics.
18539 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
18540 /// "X86select" instead of "vselect". We just can't create the "vselect" node
18541 /// for a scalar instruction.
18542 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
18543 SDValue PreservedSrc,
18544 const X86Subtarget &Subtarget,
18545 SelectionDAG &DAG) {
18546 if (isAllOnesConstant(Mask))
18547 return Op;
18549 MVT VT = Op.getSimpleValueType();
18550 SDLoc dl(Op);
18551 // The mask should be of type MVT::i1
18552 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
18554 if (Op.getOpcode() == X86ISD::FSETCCM ||
18555 Op.getOpcode() == X86ISD::FSETCCM_RND)
18556 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
18557 if (Op.getOpcode() == X86ISD::VFPCLASS ||
18558 Op.getOpcode() == X86ISD::VFPCLASSS)
18559 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
18561 if (PreservedSrc.isUndef())
18562 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
18563 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
18564 }
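// For example, a masked scalar operation is emitted here as
// (X86ISD::SELECTS (i1 (trunc i8 %mask)), (op %a, %b), %passthru),
// so only bit 0 of the incoming i8 mask participates in the select.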
18566 static int getSEHRegistrationNodeSize(const Function *Fn) {
18567 if (!Fn->hasPersonalityFn())
18568 report_fatal_error(
18569 "querying registration node size for function without personality");
18570 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
18571 // WinEHStatePass for the full struct definition.
18572 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
18573 case EHPersonality::MSVC_X86SEH: return 24;
18574 case EHPersonality::MSVC_CXX: return 16;
18575 default: break;
18576 }
18577 report_fatal_error(
18578 "can only recover FP for 32-bit MSVC EH personality functions");
18579 }
18581 /// When the MSVC runtime transfers control to us, either to an outlined
18582 /// function or when returning to a parent frame after catching an exception, we
18583 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
18584 /// Here's the math:
18585 /// RegNodeBase = EntryEBP - RegNodeSize
18586 /// ParentFP = RegNodeBase - ParentFrameOffset
18587 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
18588 /// subtracting the offset (negative on x86) takes us back to the parent FP.
18589 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
18590 SDValue EntryEBP) {
18591 MachineFunction &MF = DAG.getMachineFunction();
18592 SDLoc dl;
18594 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18595 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
18597 // It's possible that the parent function no longer has a personality function
18598 // if the exceptional code was optimized away, in which case we just return
18599 // the incoming EBP.
18600 if (!Fn->hasPersonalityFn())
18601 return EntryEBP;
18603 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
18604 // registration, or the .set_setframe offset.
18605 MCSymbol *OffsetSym =
18606 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
18607 GlobalValue::getRealLinkageName(Fn->getName()));
18608 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
18609 SDValue ParentFrameOffset =
18610 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
18612 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
18613 // prologue to RBP in the parent function.
18614 const X86Subtarget &Subtarget =
18615 static_cast<const X86Subtarget &>(DAG.getSubtarget());
18616 if (Subtarget.is64Bit())
18617 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
18619 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
18620 // RegNodeBase = EntryEBP - RegNodeSize
18621 // ParentFP = RegNodeBase - ParentFrameOffset
18622 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
18623 DAG.getConstant(RegNodeSize, dl, PtrVT));
18624 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
18625 }
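// Worked example for 32-bit MSVC SEH: RegNodeSize is 24 bytes, so with, say,
// EntryEBP = 0x100 and a ParentFrameOffset of -0x40 recorded by
// WinEHStatePass, the code above computes
//   RegNodeBase = 0x100 - 24     = 0xe8
//   ParentFP    = 0xe8 - (-0x40) = 0x128
// i.e. subtracting the (negative) offset walks back up to the parent's EBP.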
18627 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
18628 SelectionDAG &DAG) {
18629 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
18630 auto isRoundModeCurDirection = [](SDValue Rnd) {
18631 if (!isa<ConstantSDNode>(Rnd))
18632 return false;
18634 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
18635 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
18636 };
18638 SDLoc dl(Op);
18639 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18640 MVT VT = Op.getSimpleValueType();
18641 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
18642 if (IntrData) {
18643 switch(IntrData->Type) {
18644 case INTR_TYPE_1OP:
18645 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
18646 case INTR_TYPE_2OP:
18647 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
18648 Op.getOperand(2));
18649 case INTR_TYPE_3OP:
18650 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
18651 Op.getOperand(2), Op.getOperand(3));
18652 case INTR_TYPE_4OP:
18653 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
18654 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
18655 case INTR_TYPE_1OP_MASK_RM: {
18656 SDValue Src = Op.getOperand(1);
18657 SDValue PassThru = Op.getOperand(2);
18658 SDValue Mask = Op.getOperand(3);
18659 SDValue RoundingMode;
18660 // We always add rounding mode to the Node.
18661 // If the rounding mode is not specified, we add the
18662 // "current direction" mode.
18663 if (Op.getNumOperands() == 4)
18664 RoundingMode =
18665 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
18666 else
18667 RoundingMode = Op.getOperand(4);
18668 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
18669 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
18670 RoundingMode),
18671 Mask, PassThru, Subtarget, DAG);
18672 }
18673 case INTR_TYPE_1OP_MASK: {
18674 SDValue Src = Op.getOperand(1);
18675 SDValue PassThru = Op.getOperand(2);
18676 SDValue Mask = Op.getOperand(3);
18677 // We add rounding mode to the Node when
18678 // - RM Opcode is specified and
18679 // - RM is not "current direction".
18680 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18681 if (IntrWithRoundingModeOpcode != 0) {
18682 SDValue Rnd = Op.getOperand(4);
18683 if (!isRoundModeCurDirection(Rnd)) {
18684 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18685 dl, Op.getValueType(),
18686 Src, Rnd),
18687 Mask, PassThru, Subtarget, DAG);
18688 }
18689 }
18690 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
18691 Mask, PassThru, Subtarget, DAG);
18692 }
18693 case INTR_TYPE_SCALAR_MASK: {
18694 SDValue Src1 = Op.getOperand(1);
18695 SDValue Src2 = Op.getOperand(2);
18696 SDValue passThru = Op.getOperand(3);
18697 SDValue Mask = Op.getOperand(4);
18698 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
18699 Mask, passThru, Subtarget, DAG);
18701 case INTR_TYPE_SCALAR_MASK_RM: {
18702 SDValue Src1 = Op.getOperand(1);
18703 SDValue Src2 = Op.getOperand(2);
18704 SDValue Src0 = Op.getOperand(3);
18705 SDValue Mask = Op.getOperand(4);
18706 // There are 2 kinds of intrinsics in this group:
18707 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
18708 // (2) With rounding mode and sae - 7 operands.
18709 if (Op.getNumOperands() == 6) {
18710 SDValue Sae = Op.getOperand(5);
18711 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
18712 Sae),
18713 Mask, Src0, Subtarget, DAG);
18714 }
18715 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
18716 SDValue RoundingMode = Op.getOperand(5);
18717 SDValue Sae = Op.getOperand(6);
18718 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
18719 RoundingMode, Sae),
18720 Mask, Src0, Subtarget, DAG);
18721 }
18722 case INTR_TYPE_2OP_MASK:
18723 case INTR_TYPE_2OP_IMM8_MASK: {
18724 SDValue Src1 = Op.getOperand(1);
18725 SDValue Src2 = Op.getOperand(2);
18726 SDValue PassThru = Op.getOperand(3);
18727 SDValue Mask = Op.getOperand(4);
18729 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
18730 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
18732 // We specify 2 possible opcodes for intrinsics with rounding modes.
18733 // First, we check if the intrinsic may have non-default rounding mode,
18734 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
18735 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18736 if (IntrWithRoundingModeOpcode != 0) {
18737 SDValue Rnd = Op.getOperand(5);
18738 if (!isRoundModeCurDirection(Rnd)) {
18739 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18740 dl, Op.getValueType(),
18741 Src1, Src2, Rnd),
18742 Mask, PassThru, Subtarget, DAG);
18743 }
18744 }
18745 // TODO: Intrinsics should have fast-math-flags to propagate.
18746 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
18747 Mask, PassThru, Subtarget, DAG);
18748 }
18749 case INTR_TYPE_2OP_MASK_RM: {
18750 SDValue Src1 = Op.getOperand(1);
18751 SDValue Src2 = Op.getOperand(2);
18752 SDValue PassThru = Op.getOperand(3);
18753 SDValue Mask = Op.getOperand(4);
18754 // We specify 2 possible modes for intrinsics, with/without rounding
18755 // mode.
18756 // First, we check if the intrinsic has a rounding mode (6 operands),
18757 // if not, we set the rounding mode to "current".
18758 SDValue Rnd;
18759 if (Op.getNumOperands() == 6)
18760 Rnd = Op.getOperand(5);
18761 else
18762 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
18763 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18764 Src1, Src2, Rnd),
18765 Mask, PassThru, Subtarget, DAG);
18766 }
18767 case INTR_TYPE_3OP_SCALAR_MASK_RM: {
18768 SDValue Src1 = Op.getOperand(1);
18769 SDValue Src2 = Op.getOperand(2);
18770 SDValue Src3 = Op.getOperand(3);
18771 SDValue PassThru = Op.getOperand(4);
18772 SDValue Mask = Op.getOperand(5);
18773 SDValue Sae = Op.getOperand(6);
18775 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
18776 Src2, Src3, Sae),
18777 Mask, PassThru, Subtarget, DAG);
18778 }
18779 case INTR_TYPE_3OP_MASK_RM: {
18780 SDValue Src1 = Op.getOperand(1);
18781 SDValue Src2 = Op.getOperand(2);
18782 SDValue Imm = Op.getOperand(3);
18783 SDValue PassThru = Op.getOperand(4);
18784 SDValue Mask = Op.getOperand(5);
18785 // We specify 2 possible modes for intrinsics, with/without rounding
18786 // mode.
18787 // First, we check if the intrinsic has a rounding mode (7 operands),
18788 // if not, we set the rounding mode to "current".
18789 SDValue Rnd;
18790 if (Op.getNumOperands() == 7)
18791 Rnd = Op.getOperand(6);
18792 else
18793 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
18794 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18795 Src1, Src2, Imm, Rnd),
18796 Mask, PassThru, Subtarget, DAG);
18797 }
18798 case INTR_TYPE_3OP_IMM8_MASK:
18799 case INTR_TYPE_3OP_MASK: {
18800 SDValue Src1 = Op.getOperand(1);
18801 SDValue Src2 = Op.getOperand(2);
18802 SDValue Src3 = Op.getOperand(3);
18803 SDValue PassThru = Op.getOperand(4);
18804 SDValue Mask = Op.getOperand(5);
18806 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
18807 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
18809 // We specify 2 possible opcodes for intrinsics with rounding modes.
18810 // First, we check if the intrinsic may have non-default rounding mode,
18811 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
18812 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18813 if (IntrWithRoundingModeOpcode != 0) {
18814 SDValue Rnd = Op.getOperand(6);
18815 if (!isRoundModeCurDirection(Rnd)) {
18816 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18817 dl, Op.getValueType(),
18818 Src1, Src2, Src3, Rnd),
18819 Mask, PassThru, Subtarget, DAG);
18820 }
18821 }
18822 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18823 Src1, Src2, Src3),
18824 Mask, PassThru, Subtarget, DAG);
18825 }
18826 case VPERM_2OP_MASK : {
18827 SDValue Src1 = Op.getOperand(1);
18828 SDValue Src2 = Op.getOperand(2);
18829 SDValue PassThru = Op.getOperand(3);
18830 SDValue Mask = Op.getOperand(4);
18832 // Swap Src1 and Src2 in the node creation
18833 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
18834 Mask, PassThru, Subtarget, DAG);
18835 }
18836 case VPERM_3OP_MASKZ:
18837 case VPERM_3OP_MASK:{
18838 MVT VT = Op.getSimpleValueType();
18839 // Src2 is the PassThru
18840 SDValue Src1 = Op.getOperand(1);
18841 // PassThru needs to be the same type as the destination in order
18842 // to pattern match correctly.
18843 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
18844 SDValue Src3 = Op.getOperand(3);
18845 SDValue Mask = Op.getOperand(4);
18846 SDValue PassThru = SDValue();
18848 // set PassThru element
18849 if (IntrData->Type == VPERM_3OP_MASKZ)
18850 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
18851 else
18852 PassThru = Src2;
18854 // Swap Src1 and Src2 in the node creation
18855 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
18856 dl, Op.getValueType(),
18857 Src2, Src1, Src3),
18858 Mask, PassThru, Subtarget, DAG);
18859 }
18860 case FMA_OP_MASK3:
18861 case FMA_OP_MASKZ:
18862 case FMA_OP_MASK: {
18863 SDValue Src1 = Op.getOperand(1);
18864 SDValue Src2 = Op.getOperand(2);
18865 SDValue Src3 = Op.getOperand(3);
18866 SDValue Mask = Op.getOperand(4);
18867 MVT VT = Op.getSimpleValueType();
18868 SDValue PassThru = SDValue();
18870 // set PassThru element
18871 if (IntrData->Type == FMA_OP_MASKZ)
18872 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
18873 else if (IntrData->Type == FMA_OP_MASK3)
18874 PassThru = Src3;
18875 else
18876 PassThru = Src1;
18878 // We specify 2 possible opcodes for intrinsics with rounding modes.
18879 // First, we check if the intrinsic may have non-default rounding mode,
18880 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
18881 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18882 if (IntrWithRoundingModeOpcode != 0) {
18883 SDValue Rnd = Op.getOperand(5);
18884 if (!isRoundModeCurDirection(Rnd))
18885 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18886 dl, Op.getValueType(),
18887 Src1, Src2, Src3, Rnd),
18888 Mask, PassThru, Subtarget, DAG);
18889 }
18890 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
18891 dl, Op.getValueType(),
18892 Src1, Src2, Src3),
18893 Mask, PassThru, Subtarget, DAG);
18894 }
18895 case FMA_OP_SCALAR_MASK:
18896 case FMA_OP_SCALAR_MASK3:
18897 case FMA_OP_SCALAR_MASKZ: {
18898 SDValue Src1 = Op.getOperand(1);
18899 SDValue Src2 = Op.getOperand(2);
18900 SDValue Src3 = Op.getOperand(3);
18901 SDValue Mask = Op.getOperand(4);
18902 MVT VT = Op.getSimpleValueType();
18903 SDValue PassThru = SDValue();
18905 // set PassThru element
18906 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
18907 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
18908 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
18909 PassThru = Src3;
18910 else
18911 PassThru = Src1;
18913 SDValue Rnd = Op.getOperand(5);
18914 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
18915 Op.getValueType(), Src1, Src2,
18916 Src3, Rnd),
18917 Mask, PassThru, Subtarget, DAG);
18918 }
18919 case TERLOG_OP_MASK:
18920 case TERLOG_OP_MASKZ: {
18921 SDValue Src1 = Op.getOperand(1);
18922 SDValue Src2 = Op.getOperand(2);
18923 SDValue Src3 = Op.getOperand(3);
18924 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
18925 SDValue Mask = Op.getOperand(5);
18926 MVT VT = Op.getSimpleValueType();
18927 SDValue PassThru = Src1;
18928 // Set PassThru element.
18929 if (IntrData->Type == TERLOG_OP_MASKZ)
18930 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
18932 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18933 Src1, Src2, Src3, Src4),
18934 Mask, PassThru, Subtarget, DAG);
18935 }
18936 case CVTPD2PS:
18937 // ISD::FP_ROUND has a second argument that indicates if the truncation
18938 // does not change the value. Set it to 0 since it can change.
18939 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
18940 DAG.getIntPtrConstant(0, dl));
18941 case CVTPD2PS_MASK: {
18942 SDValue Src = Op.getOperand(1);
18943 SDValue PassThru = Op.getOperand(2);
18944 SDValue Mask = Op.getOperand(3);
18945 // We add rounding mode to the Node when
18946 // - RM Opcode is specified and
18947 // - RM is not "current direction".
18948 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18949 if (IntrWithRoundingModeOpcode != 0) {
18950 SDValue Rnd = Op.getOperand(4);
18951 if (!isRoundModeCurDirection(Rnd)) {
18952 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18953 dl, Op.getValueType(),
18954 Src, Rnd),
18955 Mask, PassThru, Subtarget, DAG);
18956 }
18957 }
18958 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
18959 // ISD::FP_ROUND has a second argument that indicates if the truncation
18960 // does not change the value. Set it to 0 since it can change.
18961 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
18962 DAG.getIntPtrConstant(0, dl)),
18963 Mask, PassThru, Subtarget, DAG);
18964 }
18965 case FPCLASS: {
18966 // FPclass intrinsics with mask
18967 SDValue Src1 = Op.getOperand(1);
18968 MVT VT = Src1.getSimpleValueType();
18969 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18970 SDValue Imm = Op.getOperand(2);
18971 SDValue Mask = Op.getOperand(3);
18972 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
18973 Mask.getSimpleValueType().getSizeInBits());
18974 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
18975 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
18976 DAG.getTargetConstant(0, dl, MaskVT),
18977 Subtarget, DAG);
18978 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
18979 DAG.getUNDEF(BitcastVT), FPclassMask,
18980 DAG.getIntPtrConstant(0, dl));
18981 return DAG.getBitcast(Op.getValueType(), Res);
18982 }
18983 case FPCLASSS: {
18984 SDValue Src1 = Op.getOperand(1);
18985 SDValue Imm = Op.getOperand(2);
18986 SDValue Mask = Op.getOperand(3);
18987 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
18988 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
18989 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
18990 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
18991 }
18992 case CMP_MASK:
18993 case CMP_MASK_CC: {
18994 // Comparison intrinsics with masks.
18995 // Example of transformation:
18996 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
18997 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
18999 // (v8i1 (insert_subvector undef,
19000 // (v2i1 (and (PCMPEQM %a, %b),
19001 // (extract_subvector
19002 // (v8i1 (bitcast %mask)), 0))), 0))))
19003 MVT VT = Op.getOperand(1).getSimpleValueType();
19004 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19005 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
19006 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19007 Mask.getSimpleValueType().getSizeInBits());
19008 SDValue Cmp;
19009 if (IntrData->Type == CMP_MASK_CC) {
19010 SDValue CC = Op.getOperand(3);
19011 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
19012 // We specify 2 possible opcodes for intrinsics with rounding modes.
19013 // First, we check if the intrinsic may have non-default rounding mode,
19014 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19015 if (IntrData->Opc1 != 0) {
19016 SDValue Rnd = Op.getOperand(5);
19017 if (!isRoundModeCurDirection(Rnd))
19018 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
19019 Op.getOperand(2), CC, Rnd);
19020 }
19021 //default rounding mode
19022 if (!Cmp.getNode())
19023 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19024 Op.getOperand(2), CC);
19026 } else {
19027 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
19028 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19029 Op.getOperand(2));
19030 }
19031 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
19032 DAG.getTargetConstant(0, dl,
19033 MaskVT),
19034 Subtarget, DAG);
19035 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19036 DAG.getUNDEF(BitcastVT), CmpMask,
19037 DAG.getIntPtrConstant(0, dl));
19038 return DAG.getBitcast(Op.getValueType(), Res);
19039 }
19040 case CMP_MASK_SCALAR_CC: {
19041 SDValue Src1 = Op.getOperand(1);
19042 SDValue Src2 = Op.getOperand(2);
19043 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
19044 SDValue Mask = Op.getOperand(4);
19046 SDValue Cmp;
19047 if (IntrData->Opc1 != 0) {
19048 SDValue Rnd = Op.getOperand(5);
19049 if (!isRoundModeCurDirection(Rnd))
19050 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
19051 }
19052 //default rounding mode
19053 if (!Cmp.getNode())
19054 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
19056 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
19057 DAG.getTargetConstant(0, dl,
19058 MVT::i1),
19059 Subtarget, DAG);
19061 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
19062 }
19063 case COMI: { // Comparison intrinsics
19064 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
19065 SDValue LHS = Op.getOperand(1);
19066 SDValue RHS = Op.getOperand(2);
19067 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
19068 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
19069 SDValue SetCC;
19070 switch (CC) {
19071 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
19072 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
19073 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
19074 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
19075 break;
19076 }
19077 case ISD::SETNE: { // (ZF = 1 or PF = 1)
19078 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
19079 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
19080 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
19081 break;
19082 }
19083 case ISD::SETGT: // (CF = 0 and ZF = 0)
19084 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
19085 break;
19086 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
19087 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
19088 break;
19089 }
19090 case ISD::SETGE: // CF = 0
19091 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
19092 break;
19093 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
19094 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
19095 break;
19096 default:
19097 llvm_unreachable("Unexpected illegal condition!");
19098 }
19099 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19100 }
19101 case COMI_RM: { // Comparison intrinsics with Sae
19102 SDValue LHS = Op.getOperand(1);
19103 SDValue RHS = Op.getOperand(2);
19104 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
19105 SDValue Sae = Op.getOperand(4);
19107 SDValue FCmp;
19108 if (isRoundModeCurDirection(Sae))
19109 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
19110 DAG.getConstant(CondVal, dl, MVT::i8));
19111 else
19112 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
19113 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19114 // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
19115 return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
19116 }
19117 case VSHIFT:
19118 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19119 Op.getOperand(1), Op.getOperand(2), Subtarget,
19120 DAG);
19121 case COMPRESS_EXPAND_IN_REG: {
19122 SDValue Mask = Op.getOperand(3);
19123 SDValue DataToCompress = Op.getOperand(1);
19124 SDValue PassThru = Op.getOperand(2);
19125 if (isAllOnesConstant(Mask)) // return data as is
19126 return Op.getOperand(1);
19128 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19129 DataToCompress),
19130 Mask, PassThru, Subtarget, DAG);
19131 }
19132 case BROADCASTM: {
19133 SDValue Mask = Op.getOperand(1);
19134 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19135 Mask.getSimpleValueType().getSizeInBits());
19136 Mask = DAG.getBitcast(MaskVT, Mask);
19137 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19138 }
19139 case KUNPCK: {
19140 MVT VT = Op.getSimpleValueType();
19141 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19143 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19144 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19145 // Arguments should be swapped.
19146 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19147 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19148 Src2, Src1);
19149 return DAG.getBitcast(VT, Res);
19150 }
19151 case FIXUPIMMS:
19152 case FIXUPIMMS_MASKZ:
19153 case FIXUPIMM:
19154 case FIXUPIMM_MASKZ:{
19155 SDValue Src1 = Op.getOperand(1);
19156 SDValue Src2 = Op.getOperand(2);
19157 SDValue Src3 = Op.getOperand(3);
19158 SDValue Imm = Op.getOperand(4);
19159 SDValue Mask = Op.getOperand(5);
19160 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19161 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19162 // We specify 2 possible modes for intrinsics, with/without rounding
19163 // mode.
19164 // First, we check if the intrinsic has a rounding mode (7 operands),
19165 // if not, we set the rounding mode to "current".
19166 SDValue Rnd;
19167 if (Op.getNumOperands() == 7)
19168 Rnd = Op.getOperand(6);
19169 else
19170 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19171 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19172 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19173 Src1, Src2, Src3, Imm, Rnd),
19174 Mask, Passthru, Subtarget, DAG);
19175 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19176 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19177 Src1, Src2, Src3, Imm, Rnd),
19178 Mask, Passthru, Subtarget, DAG);
19180 case CONVERT_TO_MASK: {
19181 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19182 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19183 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19185 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
19186 Op.getOperand(1));
19187 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19188 DAG.getUNDEF(BitcastVT), CvtMask,
19189 DAG.getIntPtrConstant(0, dl));
19190 return DAG.getBitcast(Op.getValueType(), Res);
19192 case CONVERT_MASK_TO_VEC: {
19193 SDValue Mask = Op.getOperand(1);
19194 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19195 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19196 return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
19198 case BRCST_SUBVEC_TO_VEC: {
19199 SDValue Src = Op.getOperand(1);
19200 SDValue Passthru = Op.getOperand(2);
19201 SDValue Mask = Op.getOperand(3);
19202 EVT resVT = Passthru.getValueType();
19203 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19204 DAG.getUNDEF(resVT), Src,
19205 DAG.getIntPtrConstant(0, dl));
19206 SDValue immVal;
19207 if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
19208 immVal = DAG.getConstant(0x44, dl, MVT::i8);
19209 else
19210 immVal = DAG.getConstant(0, dl, MVT::i8);
19211 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19212 subVec, subVec, immVal),
19213 Mask, Passthru, Subtarget, DAG);
19215 case BRCST32x2_TO_VEC: {
19216 SDValue Src = Op.getOperand(1);
19217 SDValue PassThru = Op.getOperand(2);
19218 SDValue Mask = Op.getOperand(3);
19220 assert((VT.getScalarType() == MVT::i32 ||
19221 VT.getScalarType() == MVT::f32) && "Unexpected type!");
19222 //bitcast Src to packed 64
19223 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19224 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19225 Src = DAG.getBitcast(BitcastVT, Src);
19227 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19228 Mask, PassThru, Subtarget, DAG);
19236 default: return SDValue(); // Don't custom lower most intrinsics.
19238 case Intrinsic::x86_avx2_permd:
19239 case Intrinsic::x86_avx2_permps:
19240 // Operands intentionally swapped. Mask is last operand to intrinsic,
19241 // but second operand for node/instruction.
19242 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
19243 Op.getOperand(2), Op.getOperand(1));
19245 // ptest and testp intrinsics. The intrinsic these come from are designed to
19246 // return an integer value, not just an instruction so lower it to the ptest
19247 // or testp pattern and a setcc for the result.
19248 case Intrinsic::x86_sse41_ptestz:
19249 case Intrinsic::x86_sse41_ptestc:
19250 case Intrinsic::x86_sse41_ptestnzc:
19251 case Intrinsic::x86_avx_ptestz_256:
19252 case Intrinsic::x86_avx_ptestc_256:
19253 case Intrinsic::x86_avx_ptestnzc_256:
19254 case Intrinsic::x86_avx_vtestz_ps:
19255 case Intrinsic::x86_avx_vtestc_ps:
19256 case Intrinsic::x86_avx_vtestnzc_ps:
19257 case Intrinsic::x86_avx_vtestz_pd:
19258 case Intrinsic::x86_avx_vtestc_pd:
19259 case Intrinsic::x86_avx_vtestnzc_pd:
19260 case Intrinsic::x86_avx_vtestz_ps_256:
19261 case Intrinsic::x86_avx_vtestc_ps_256:
19262 case Intrinsic::x86_avx_vtestnzc_ps_256:
19263 case Intrinsic::x86_avx_vtestz_pd_256:
19264 case Intrinsic::x86_avx_vtestc_pd_256:
19265 case Intrinsic::x86_avx_vtestnzc_pd_256: {
19266 bool IsTestPacked = false;
19267 X86::CondCode X86CC;
19268 switch (IntNo) {
19269 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
19270 case Intrinsic::x86_avx_vtestz_ps:
19271 case Intrinsic::x86_avx_vtestz_pd:
19272 case Intrinsic::x86_avx_vtestz_ps_256:
19273 case Intrinsic::x86_avx_vtestz_pd_256:
19274 IsTestPacked = true;
19275 LLVM_FALLTHROUGH;
19276 case Intrinsic::x86_sse41_ptestz:
19277 case Intrinsic::x86_avx_ptestz_256:
19278 // ZF = 1
19279 X86CC = X86::COND_E;
19280 break;
19281 case Intrinsic::x86_avx_vtestc_ps:
19282 case Intrinsic::x86_avx_vtestc_pd:
19283 case Intrinsic::x86_avx_vtestc_ps_256:
19284 case Intrinsic::x86_avx_vtestc_pd_256:
19285 IsTestPacked = true;
19286 LLVM_FALLTHROUGH;
19287 case Intrinsic::x86_sse41_ptestc:
19288 case Intrinsic::x86_avx_ptestc_256:
19289 // CF = 1
19290 X86CC = X86::COND_B;
19291 break;
19292 case Intrinsic::x86_avx_vtestnzc_ps:
19293 case Intrinsic::x86_avx_vtestnzc_pd:
19294 case Intrinsic::x86_avx_vtestnzc_ps_256:
19295 case Intrinsic::x86_avx_vtestnzc_pd_256:
19296 IsTestPacked = true;
19297 LLVM_FALLTHROUGH;
19298 case Intrinsic::x86_sse41_ptestnzc:
19299 case Intrinsic::x86_avx_ptestnzc_256:
19300 // ZF and CF = 0
19301 X86CC = X86::COND_A;
19302 break;
19303 }
19305 SDValue LHS = Op.getOperand(1);
19306 SDValue RHS = Op.getOperand(2);
19307 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
19308 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
19309 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19310 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19312 case Intrinsic::x86_avx512_kortestz_w:
19313 case Intrinsic::x86_avx512_kortestc_w: {
19314 X86::CondCode X86CC =
19315 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
19316 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19317 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19318 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
19319 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19320 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19323 case Intrinsic::x86_sse42_pcmpistria128:
19324 case Intrinsic::x86_sse42_pcmpestria128:
19325 case Intrinsic::x86_sse42_pcmpistric128:
19326 case Intrinsic::x86_sse42_pcmpestric128:
19327 case Intrinsic::x86_sse42_pcmpistrio128:
19328 case Intrinsic::x86_sse42_pcmpestrio128:
19329 case Intrinsic::x86_sse42_pcmpistris128:
19330 case Intrinsic::x86_sse42_pcmpestris128:
19331 case Intrinsic::x86_sse42_pcmpistriz128:
19332 case Intrinsic::x86_sse42_pcmpestriz128: {
19333 unsigned Opcode;
19334 X86::CondCode X86CC;
19335 switch (IntNo) {
19336 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
19337 case Intrinsic::x86_sse42_pcmpistria128:
19338 Opcode = X86ISD::PCMPISTRI;
19339 X86CC = X86::COND_A;
19340 break;
19341 case Intrinsic::x86_sse42_pcmpestria128:
19342 Opcode = X86ISD::PCMPESTRI;
19343 X86CC = X86::COND_A;
19344 break;
19345 case Intrinsic::x86_sse42_pcmpistric128:
19346 Opcode = X86ISD::PCMPISTRI;
19347 X86CC = X86::COND_B;
19348 break;
19349 case Intrinsic::x86_sse42_pcmpestric128:
19350 Opcode = X86ISD::PCMPESTRI;
19351 X86CC = X86::COND_B;
19352 break;
19353 case Intrinsic::x86_sse42_pcmpistrio128:
19354 Opcode = X86ISD::PCMPISTRI;
19355 X86CC = X86::COND_O;
19356 break;
19357 case Intrinsic::x86_sse42_pcmpestrio128:
19358 Opcode = X86ISD::PCMPESTRI;
19359 X86CC = X86::COND_O;
19360 break;
19361 case Intrinsic::x86_sse42_pcmpistris128:
19362 Opcode = X86ISD::PCMPISTRI;
19363 X86CC = X86::COND_S;
19364 break;
19365 case Intrinsic::x86_sse42_pcmpestris128:
19366 Opcode = X86ISD::PCMPESTRI;
19367 X86CC = X86::COND_S;
19368 break;
19369 case Intrinsic::x86_sse42_pcmpistriz128:
19370 Opcode = X86ISD::PCMPISTRI;
19371 X86CC = X86::COND_E;
19372 break;
19373 case Intrinsic::x86_sse42_pcmpestriz128:
19374 Opcode = X86ISD::PCMPESTRI;
19375 X86CC = X86::COND_E;
19376 break;
19377 }
19378 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19379 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19380 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
19381 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
19382 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19383 }
19385 case Intrinsic::x86_sse42_pcmpistri128:
19386 case Intrinsic::x86_sse42_pcmpestri128: {
19387 unsigned Opcode;
19388 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
19389 Opcode = X86ISD::PCMPISTRI;
19390 else
19391 Opcode = X86ISD::PCMPESTRI;
19393 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19394 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19395 return DAG.getNode(Opcode, dl, VTs, NewOps);
19396 }
19398 case Intrinsic::eh_sjlj_lsda: {
19399 MachineFunction &MF = DAG.getMachineFunction();
19400 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19401 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19402 auto &Context = MF.getMMI().getContext();
19403 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
19404 Twine(MF.getFunctionNumber()));
19405 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
19408 case Intrinsic::x86_seh_lsda: {
19409 // Compute the symbol for the LSDA. We know it'll get emitted later.
19410 MachineFunction &MF = DAG.getMachineFunction();
19411 SDValue Op1 = Op.getOperand(1);
19412 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
19413 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
19414 GlobalValue::getRealLinkageName(Fn->getName()));
19416 // Generate a simple absolute symbol reference. This intrinsic is only
19417 // supported on 32-bit Windows, which isn't PIC.
19418 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
19419 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
19422 case Intrinsic::x86_seh_recoverfp: {
19423 SDValue FnOp = Op.getOperand(1);
19424 SDValue IncomingFPOp = Op.getOperand(2);
19425 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
19426 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
19427 if (!Fn)
19428 report_fatal_error(
19429 "llvm.x86.seh.recoverfp must take a function as the first argument");
19430 return recoverFramePointer(DAG, Fn, IncomingFPOp);
19433 case Intrinsic::localaddress: {
19434 // Returns one of the stack, base, or frame pointer registers, depending on
19435 // which is used to reference local variables.
19436 MachineFunction &MF = DAG.getMachineFunction();
19437 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19438 unsigned Reg;
19439 if (RegInfo->hasBasePointer(MF))
19440 Reg = RegInfo->getBaseRegister();
19441 else // This function handles the SP or FP case.
19442 Reg = RegInfo->getPtrSizedFrameRegister(MF);
19443 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
19444 }
19445 }
19446 }
19448 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19449 SDValue Src, SDValue Mask, SDValue Base,
19450 SDValue Index, SDValue ScaleOp, SDValue Chain,
19451 const X86Subtarget &Subtarget) {
19452 SDLoc dl(Op);
19453 auto *C = cast<ConstantSDNode>(ScaleOp);
19454 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
19455 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19456 Index.getSimpleValueType().getVectorNumElements());
19458 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19459 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
19460 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19461 SDValue Segment = DAG.getRegister(0, MVT::i32);
19462 if (Src.isUndef())
19463 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
19464 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
19465 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
19466 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
19467 return DAG.getMergeValues(RetOps, dl);
19468 }
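// For example, a masked gather builds its machine node from the operand list
// {Src, VMask, Base, Scale, Index, Disp=0, Segment=%noreg, Chain} above and
// merges result 0 (the gathered vector) with result 2 (the output chain);
// the written-back mask in between is dropped.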
19470 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19471 SDValue Src, SDValue Mask, SDValue Base,
19472 SDValue Index, SDValue ScaleOp, SDValue Chain,
19473 const X86Subtarget &Subtarget) {
19474 SDLoc dl(Op);
19475 auto *C = cast<ConstantSDNode>(ScaleOp);
19476 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
19477 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19478 SDValue Segment = DAG.getRegister(0, MVT::i32);
19479 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19480 Index.getSimpleValueType().getVectorNumElements());
19482 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19483 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
19484 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
19485 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
19486 return SDValue(Res, 1);
19489 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19490 SDValue Mask, SDValue Base, SDValue Index,
19491 SDValue ScaleOp, SDValue Chain,
19492 const X86Subtarget &Subtarget) {
19493 SDLoc dl(Op);
19494 auto *C = cast<ConstantSDNode>(ScaleOp);
19495 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
19496 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19497 SDValue Segment = DAG.getRegister(0, MVT::i32);
19498 MVT MaskVT =
19499 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
19500 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19501 //SDVTList VTs = DAG.getVTList(MVT::Other);
19502 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
19503 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
19504 return SDValue(Res, 0);
19507 /// Handles the lowering of builtin intrinsic that return the value
19508 /// of the extended control register.
19509 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
19510 SelectionDAG &DAG,
19511 const X86Subtarget &Subtarget,
19512 SmallVectorImpl<SDValue> &Results) {
19513 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
19514 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19515 SDValue LO, HI;
19517 // The ECX register is used to select the index of the XCR register to
19518 // return.
19519 SDValue Chain =
19520 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
19521 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
19522 Chain = SDValue(N1, 0);
19524 // Reads the content of XCR and returns it in registers EDX:EAX.
19525 if (Subtarget.is64Bit()) {
19526 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
19527 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
19530 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
19531 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
19534 Chain = HI.getValue(1);
19536 if (Subtarget.is64Bit()) {
19537 // Merge the two 32-bit values into a 64-bit one..
19538 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
19539 DAG.getConstant(32, DL, MVT::i8));
19540 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
19541 Results.push_back(Chain);
19545 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
19546 SDValue Ops[] = { LO, HI };
19547 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
19548 Results.push_back(Pair);
19549 Results.push_back(Chain);
19550 }
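// For example, xgetbv with ECX = 0 reads XCR0 into EDX:EAX; on 64-bit targets
// the halves are recombined above as (or LO, (shl HI, 32)), while 32-bit
// targets hand back an i64 BUILD_PAIR(LO, HI) instead.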
19552 /// Handles the lowering of builtin intrinsics that read performance monitor
19553 /// counters (x86_rdpmc).
19554 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
19555 SelectionDAG &DAG,
19556 const X86Subtarget &Subtarget,
19557 SmallVectorImpl<SDValue> &Results) {
19558 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
19559 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19560 SDValue LO, HI;
19562 // The ECX register is used to select the index of the performance counter
19563 // to read.
19564 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
19565 N->getOperand(2));
19566 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
19568 // Reads the content of a 64-bit performance counter and returns it in the
19569 // registers EDX:EAX.
19570 if (Subtarget.is64Bit()) {
19571 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
19572 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
19575 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
19576 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
19579 Chain = HI.getValue(1);
19581 if (Subtarget.is64Bit()) {
19582 // The EAX register is loaded with the low-order 32 bits. The EDX register
19583 // is loaded with the supported high-order bits of the counter.
19584 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
19585 DAG.getConstant(32, DL, MVT::i8));
19586 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
19587 Results.push_back(Chain);
19591 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
19592 SDValue Ops[] = { LO, HI };
19593 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
19594 Results.push_back(Pair);
19595 Results.push_back(Chain);
19598 /// Handles the lowering of builtin intrinsics that read the time stamp counter
19599 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
19600 /// READCYCLECOUNTER nodes.
19601 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
19602 SelectionDAG &DAG,
19603 const X86Subtarget &Subtarget,
19604 SmallVectorImpl<SDValue> &Results) {
19605 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19606 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
19607 SDValue LO, HI;
19609 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
19610 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
19611 // and the EAX register is loaded with the low-order 32 bits.
19612 if (Subtarget.is64Bit()) {
19613 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
19614 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
19617 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
19618 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
19621 SDValue Chain = HI.getValue(1);
19623 if (Opcode == X86ISD::RDTSCP_DAG) {
19624 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
19626 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
19627 // the ECX register. Add 'ecx' explicitly to the chain.
19628 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
19630 // Explicitly store the content of ECX at the location passed in input
19631 // to the 'rdtscp' intrinsic.
19632 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
19633 MachinePointerInfo());
19636 if (Subtarget.is64Bit()) {
19637 // The EDX register is loaded with the high-order 32 bits of the MSR, and
19638 // the EAX register is loaded with the low-order 32 bits.
19639 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
19640 DAG.getConstant(32, DL, MVT::i8));
19641 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
19642 Results.push_back(Chain);
19646 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
19647 SDValue Ops[] = { LO, HI };
19648 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
19649 Results.push_back(Pair);
19650 Results.push_back(Chain);
19651 }
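// For example, llvm.x86.rdtscp comes through this helper as an RDTSCP_DAG
// node: EDX:EAX are copied out and merged into the 64-bit counter value
// (via (or LO, (shl HI, 32)) on 64-bit targets), and the TSC_AUX value read
// from ECX is stored to the intrinsic's pointer operand before the chain is
// returned.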
19653 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
19654 SelectionDAG &DAG) {
19655 SmallVector<SDValue, 2> Results;
19656 SDLoc DL(Op);
19657 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
19658 Results);
19659 return DAG.getMergeValues(Results, DL);
19660 }
19662 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
19663 MachineFunction &MF = DAG.getMachineFunction();
19664 SDValue Chain = Op.getOperand(0);
19665 SDValue RegNode = Op.getOperand(2);
19666 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
19667 if (!EHInfo)
19668 report_fatal_error("EH registrations only live in functions using WinEH");
19670 // Cast the operand to an alloca, and remember the frame index.
19671 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
19672 if (!FINode)
19673 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
19674 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
19676 // Return the chain operand without making any DAG nodes.
19677 return Chain;
19678 }
19680 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
19681 MachineFunction &MF = DAG.getMachineFunction();
19682 SDValue Chain = Op.getOperand(0);
19683 SDValue EHGuard = Op.getOperand(2);
19684 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
19685 if (!EHInfo)
19686 report_fatal_error("EHGuard only live in functions using WinEH");
19688 // Cast the operand to an alloca, and remember the frame index.
19689 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
19690 if (!FINode)
19691 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
19692 EHInfo->EHGuardFrameIndex = FINode->getIndex();
19694 // Return the chain operand without making any DAG nodes.
19695 return Chain;
19696 }
19698 /// Emit Truncating Store with signed or unsigned saturation.
19699 static SDValue
19700 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
19701 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
19702 SelectionDAG &DAG) {
19704 SDVTList VTs = DAG.getVTList(MVT::Other);
19705 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
19706 SDValue Ops[] = { Chain, Val, Ptr, Undef };
19707 return SignedSat ?
19708 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
19709 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
19710 }
19712 /// Emit Masked Truncating Store with signed or unsigned saturation.
19713 static SDValue
19714 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
19715 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
19716 MachineMemOperand *MMO, SelectionDAG &DAG) {
19718 SDVTList VTs = DAG.getVTList(MVT::Other);
19719 SDValue Ops[] = { Chain, Ptr, Mask, Val };
19720 return SignedSat ?
19721 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
19722 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
19723 }
19725 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19726 SelectionDAG &DAG) {
19727 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
19729 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
19730 if (!IntrData) {
19731 if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
19732 return MarkEHRegistrationNode(Op, DAG);
19733 if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
19734 return MarkEHGuard(Op, DAG);
19735 if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
19736 IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
19737 IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
19738 IntNo == llvm::Intrinsic::x86_flags_write_u64) {
19739 // We need a frame pointer because this will get lowered to a PUSH/POP
19740 // sequence.
19741 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19742 MFI.setHasCopyImplyingStackAdjustment(true);
19743 // Don't do anything here, we will expand these intrinsics out later
19744 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
19745 return SDValue();
19746 }
19747 return SDValue();
19748 }
19750 SDLoc dl(Op);
19751 switch(IntrData->Type) {
19752 default: llvm_unreachable("Unknown Intrinsic Type");
19753 case RDSEED:
19754 case RDRAND: {
19755 // Emit the node with the right value type.
19756 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
19757 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
19759 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
19760 // Otherwise return the value from Rand, which is always 0, casted to i32.
19761 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
19762 DAG.getConstant(1, dl, Op->getValueType(1)),
19763 DAG.getConstant(X86::COND_B, dl, MVT::i32),
19764 SDValue(Result.getNode(), 1) };
19765 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
19766 DAG.getVTList(Op->getValueType(1), MVT::Glue),
19767 Ops);
19769 // Return { result, isValid, chain }.
19770 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
19771 SDValue(Result.getNode(), 2));
19772 }
19773 case GATHER: {
19774 //gather(v1, mask, index, base, scale);
19775 SDValue Chain = Op.getOperand(0);
19776 SDValue Src = Op.getOperand(2);
19777 SDValue Base = Op.getOperand(3);
19778 SDValue Index = Op.getOperand(4);
19779 SDValue Mask = Op.getOperand(5);
19780 SDValue Scale = Op.getOperand(6);
19781 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
19782 Chain, Subtarget);
19783 }
19784 case SCATTER: {
19785 //scatter(base, mask, index, v1, scale);
19786 SDValue Chain = Op.getOperand(0);
19787 SDValue Base = Op.getOperand(2);
19788 SDValue Mask = Op.getOperand(3);
19789 SDValue Index = Op.getOperand(4);
19790 SDValue Src = Op.getOperand(5);
19791 SDValue Scale = Op.getOperand(6);
19792 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
19793 Scale, Chain, Subtarget);
19794 }
19795 case PREFETCH: {
19796 SDValue Hint = Op.getOperand(6);
19797 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
19798 assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
19799 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
19800 SDValue Chain = Op.getOperand(0);
19801 SDValue Mask = Op.getOperand(2);
19802 SDValue Index = Op.getOperand(3);
19803 SDValue Base = Op.getOperand(4);
19804 SDValue Scale = Op.getOperand(5);
19805 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
19806 Subtarget);
19807 }
19808 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
19809 case RDTSC: {
19810 SmallVector<SDValue, 2> Results;
19811 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
19812 Results);
19813 return DAG.getMergeValues(Results, dl);
19814 }
19815 // Read Performance Monitoring Counters.
19816 case RDPMC: {
19817 SmallVector<SDValue, 2> Results;
19818 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
19819 return DAG.getMergeValues(Results, dl);
19820 }
19821 // Get Extended Control Register.
19822 case XGETBV: {
19823 SmallVector<SDValue, 2> Results;
19824 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
19825 return DAG.getMergeValues(Results, dl);
19826 }
19827 // XTEST intrinsics.
19828 case XTEST: {
19829 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
19830 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
19832 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
19833 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
19834 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
19835 Ret, SDValue(InTrans.getNode(), 1));
19836 }
19838 case ADX: {
19839 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
19840 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
19841 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
19842 DAG.getConstant(-1, dl, MVT::i8));
19843 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
19844 Op.getOperand(4), GenCF.getValue(1));
19845 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
19846 Op.getOperand(5), MachinePointerInfo());
19847 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
19848 SDValue Results[] = { SetCC, Store };
19849 return DAG.getMergeValues(Results, dl);
19851 case COMPRESS_TO_MEM: {
19852 SDValue Mask = Op.getOperand(4);
19853 SDValue DataToCompress = Op.getOperand(3);
19854 SDValue Addr = Op.getOperand(2);
19855 SDValue Chain = Op.getOperand(0);
19856 MVT VT = DataToCompress.getSimpleValueType();
19858 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
19859 assert(MemIntr && "Expected MemIntrinsicSDNode!");
19861 if (isAllOnesConstant(Mask)) // return just a store
19862 return DAG.getStore(Chain, dl, DataToCompress, Addr,
19863 MemIntr->getMemOperand());
19865 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19866 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19868 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
19869 MemIntr->getMemOperand(),
19870 false /* truncating */, true /* compressing */);
19872 case TRUNCATE_TO_MEM_VI8:
19873 case TRUNCATE_TO_MEM_VI16:
19874 case TRUNCATE_TO_MEM_VI32: {
19875 SDValue Mask = Op.getOperand(4);
19876 SDValue DataToTruncate = Op.getOperand(3);
19877 SDValue Addr = Op.getOperand(2);
19878 SDValue Chain = Op.getOperand(0);
19880 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
19881 assert(MemIntr && "Expected MemIntrinsicSDNode!");
19883 EVT MemVT = MemIntr->getMemoryVT();
19885 uint16_t TruncationOp = IntrData->Opc0;
19886 switch (TruncationOp) {
19887 case X86ISD::VTRUNC: {
19888 if (isAllOnesConstant(Mask)) // return just a truncate store
19889 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
19890 MemIntr->getMemOperand());
19892 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
19893 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19895 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
19896 MemIntr->getMemOperand(), true /* truncating */);
19898 case X86ISD::VTRUNCUS:
19899 case X86ISD::VTRUNCS: {
19900 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
19901 if (isAllOnesConstant(Mask))
19902 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
19903 MemIntr->getMemOperand(), DAG);
19905 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
19906 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19908 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
19909 VMask, MemVT, MemIntr->getMemOperand(), DAG);
19912 llvm_unreachable("Unsupported truncstore intrinsic");
19916 case EXPAND_FROM_MEM: {
19917 SDValue Mask = Op.getOperand(4);
19918 SDValue PassThru = Op.getOperand(3);
19919 SDValue Addr = Op.getOperand(2);
19920 SDValue Chain = Op.getOperand(0);
19921 MVT VT = Op.getSimpleValueType();
19923 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
19924 assert(MemIntr && "Expected MemIntrinsicSDNode!");
19926 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
19927 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
19928 if (X86::isZeroNode(Mask))
19929 return DAG.getUNDEF(VT);
19931 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19932 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19933 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
19934 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
19935 true /* expanding */);
19940 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
19941 SelectionDAG &DAG) const {
19942 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19943 MFI.setReturnAddressIsTaken(true);
19945 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
19948 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19950 EVT PtrVT = getPointerTy(DAG.getDataLayout());
19953 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
19954 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19955 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
19956 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
19957 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
19958 MachinePointerInfo());
19961 // Just load the return address.
19962 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
19963 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
19964 MachinePointerInfo());
19967 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
19968 SelectionDAG &DAG) const {
19969 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
19970 return getReturnAddressFrameIndex(DAG);
19973 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
19974 MachineFunction &MF = DAG.getMachineFunction();
19975 MachineFrameInfo &MFI = MF.getFrameInfo();
19976 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
19977 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19978 EVT VT = Op.getValueType();
19980 MFI.setFrameAddressIsTaken(true);
19982 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
19983 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
19984 // is not possible to crawl up the stack without looking at the unwind codes.
19986 int FrameAddrIndex = FuncInfo->getFAIndex();
19987 if (!FrameAddrIndex) {
19988 // Set up a frame object for the return address.
19989 unsigned SlotSize = RegInfo->getSlotSize();
19990 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
19991 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
19992 FuncInfo->setFAIndex(FrameAddrIndex);
19994 return DAG.getFrameIndex(FrameAddrIndex, VT);
19997 unsigned FrameReg =
19998 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
19999 SDLoc dl(Op); // FIXME probably not meaningful
20000 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20001 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
20002 (FrameReg == X86::EBP && VT == MVT::i32)) &&
20003 "Invalid Frame Register!");
20004 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
20006 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
20007 MachinePointerInfo());
20011 // FIXME? Maybe this could be a TableGen attribute on some registers and
20012 // this table could be generated automatically from RegInfo.
20013 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
20014 SelectionDAG &DAG) const {
20015 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20016 const MachineFunction &MF = DAG.getMachineFunction();
20018 unsigned Reg = StringSwitch<unsigned>(RegName)
20019 .Case("esp", X86::ESP)
20020 .Case("rsp", X86::RSP)
20021 .Case("ebp", X86::EBP)
20022 .Case("rbp", X86::RBP)
20025 if (Reg == X86::EBP || Reg == X86::RBP) {
20026 if (!TFI.hasFP(MF))
20027 report_fatal_error("register " + StringRef(RegName) +
20028 " is allocatable: function has no frame pointer");
20031 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20032 unsigned FrameReg =
20033 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20034 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
20035 "Invalid Frame Register!");
20043 report_fatal_error("Invalid register name global variable");
20046 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
20047 SelectionDAG &DAG) const {
20048 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20049 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
20052 unsigned X86TargetLowering::getExceptionPointerRegister(
20053 const Constant *PersonalityFn) const {
20054 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
20055 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20057 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
20060 unsigned X86TargetLowering::getExceptionSelectorRegister(
20061 const Constant *PersonalityFn) const {
20062 // Funclet personalities don't use selectors (the runtime does the selection).
20063 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
20064 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20067 bool X86TargetLowering::needsFixedCatchObjects() const {
20068 return Subtarget.isTargetWin64();
20071 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
20072 SDValue Chain = Op.getOperand(0);
20073 SDValue Offset = Op.getOperand(1);
20074 SDValue Handler = Op.getOperand(2);
20077 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20078 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20079 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
20080 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20081 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20082 "Invalid Frame Register!");
20083 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
20084 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
20086 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
20087 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
20089 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
20090 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
20091 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
20093 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
20094 DAG.getRegister(StoreAddrReg, PtrVT));
20097 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20098 SelectionDAG &DAG) const {
20100 // If the subtarget is not 64-bit, we may need the global base reg
20101 // after the isel pseudo expansion, i.e., after the CGBR pass has run.
20102 // Therefore, ask for the GlobalBaseReg now, so that the pass
20103 // inserts the code for us in case we need it.
20104 // Otherwise, we will end up in a situation where we will
20105 // reference a virtual register that is not defined!
20106 if (!Subtarget.is64Bit()) {
20107 const X86InstrInfo *TII = Subtarget.getInstrInfo();
20108 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20110 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20111 DAG.getVTList(MVT::i32, MVT::Other),
20112 Op.getOperand(0), Op.getOperand(1));
20115 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20116 SelectionDAG &DAG) const {
20118 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20119 Op.getOperand(0), Op.getOperand(1));
20122 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20123 SelectionDAG &DAG) const {
20125 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
20129 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20130 return Op.getOperand(0);
20133 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20134 SelectionDAG &DAG) const {
20135 SDValue Root = Op.getOperand(0);
20136 SDValue Trmp = Op.getOperand(1); // trampoline
20137 SDValue FPtr = Op.getOperand(2); // nested function
20138 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20141 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20142 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20144 if (Subtarget.is64Bit()) {
20145 SDValue OutChains[6];
20147 // Large code-model.
20148 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
20149 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20151 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20152 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20154 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
20156 // Load the pointer to the nested function into R11.
20157 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20158 SDValue Addr = Trmp;
20159 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20160 Addr, MachinePointerInfo(TrmpAddr));
20162 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20163 DAG.getConstant(2, dl, MVT::i64));
20165 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
20166 /* Alignment = */ 2);
20168 // Load the 'nest' parameter value into R10.
20169 // R10 is specified in X86CallingConv.td
20170 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
20171 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20172 DAG.getConstant(10, dl, MVT::i64));
20173 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20174 Addr, MachinePointerInfo(TrmpAddr, 10));
20176 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20177 DAG.getConstant(12, dl, MVT::i64));
20179 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
20180 /* Alignment = */ 2);
20182 // Jump to the nested function.
20183 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
20184 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20185 DAG.getConstant(20, dl, MVT::i64));
20186 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20187 Addr, MachinePointerInfo(TrmpAddr, 20));
20189 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
20190 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20191 DAG.getConstant(22, dl, MVT::i64));
20192 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
20193 Addr, MachinePointerInfo(TrmpAddr, 22));
20195 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
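// Illustrative byte layout of the 24-byte trampoline built above (the i16
// opcode stores are little-endian, so the REX prefix 0x49 lands first):
//   +0:  49 BB <FPtr, 8 bytes>    movabsq $<nested fn>, %r11
//   +10: 49 BA <Nest, 8 bytes>    movabsq $<nest value>, %r10
//   +20: 49 FF E3                 jmpq   *%r11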
20197 const Function *Func =
20198 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
20199 CallingConv::ID CC = Func->getCallingConv();
20204 llvm_unreachable("Unsupported calling convention");
20205 case CallingConv::C:
20206 case CallingConv::X86_StdCall: {
20207 // Pass 'nest' parameter in ECX.
20208 // Must be kept in sync with X86CallingConv.td
20209 NestReg = X86::ECX;
20211 // Check that ECX wasn't needed by an 'inreg' parameter.
20212 FunctionType *FTy = Func->getFunctionType();
20213 const AttributeSet &Attrs = Func->getAttributes();
20215 if (!Attrs.isEmpty() && !Func->isVarArg()) {
20216 unsigned InRegCount = 0;
20219 for (FunctionType::param_iterator I = FTy->param_begin(),
20220 E = FTy->param_end(); I != E; ++I, ++Idx)
20221 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
20222 auto &DL = DAG.getDataLayout();
20223 // FIXME: should only count parameters that are lowered to integers.
20224 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
20227 if (InRegCount > 2) {
20228 report_fatal_error("Nest register in use - reduce number of inreg"
20234 case CallingConv::X86_FastCall:
20235 case CallingConv::X86_ThisCall:
20236 case CallingConv::Fast:
20237 // Pass 'nest' parameter in EAX.
20238 // Must be kept in sync with X86CallingConv.td
20239 NestReg = X86::EAX;
20243 SDValue OutChains[4];
20244 SDValue Addr, Disp;
20246 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20247 DAG.getConstant(10, dl, MVT::i32));
20248 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
20250 // This is storing the opcode for MOV32ri.
20251 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
20252 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
20254 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
20255 Trmp, MachinePointerInfo(TrmpAddr));
20257 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20258 DAG.getConstant(1, dl, MVT::i32));
20260 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
20261 /* Alignment = */ 1);
20263 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
20264 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20265 DAG.getConstant(5, dl, MVT::i32));
20266 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
20267 Addr, MachinePointerInfo(TrmpAddr, 5),
20268 /* Alignment = */ 1);
20270 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20271 DAG.getConstant(6, dl, MVT::i32));
20273 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
20274 /* Alignment = */ 1);
20276 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20280 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
20281 SelectionDAG &DAG) const {
20283 // The rounding mode is in bits 11:10 of FPSR, and has the following settings:
20285 //   00 Round to nearest, 01 Round to -inf, 10 Round to +inf, 11 Round to zero
20290 // FLT_ROUNDS, on the other hand, expects the following:
//   -1 Undefined, 0 Round to zero, 1 Round to nearest, 2 Round to +inf, 3 Round to -inf
20297 // To perform the conversion, we do:
20298 //   (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
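// Worked example: with the control word set to round toward -inf, bits 11:10
// read 01 (FPSR & 0xC00 == 0x400), so
//   ((0x400 & 0x800) >> 11) = 0,  ((0x400 & 0x400) >> 9) = 2,
//   ((0 | 2) + 1) & 3 = 3,
// which is the FLT_ROUNDS encoding for round toward -inf.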
20301 MachineFunction &MF = DAG.getMachineFunction();
20302 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20303 unsigned StackAlignment = TFI.getStackAlignment();
20304 MVT VT = Op.getSimpleValueType();
20307 // Save FP Control Word to stack slot
20308 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
20309 SDValue StackSlot =
20310 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
20312 MachineMemOperand *MMO =
20313 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
20314 MachineMemOperand::MOStore, 2, 2);
20316 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
20317 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
20318 DAG.getVTList(MVT::Other),
20319 Ops, MVT::i16, MMO);
20321 // Load FP Control Word from stack slot
20323 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
20325 // Transform as necessary
20327 DAG.getNode(ISD::SRL, DL, MVT::i16,
20328 DAG.getNode(ISD::AND, DL, MVT::i16,
20329 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
20330 DAG.getConstant(11, DL, MVT::i8));
20332 DAG.getNode(ISD::SRL, DL, MVT::i16,
20333 DAG.getNode(ISD::AND, DL, MVT::i16,
20334 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
20335 DAG.getConstant(9, DL, MVT::i8));
20338 DAG.getNode(ISD::AND, DL, MVT::i16,
20339 DAG.getNode(ISD::ADD, DL, MVT::i16,
20340 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
20341 DAG.getConstant(1, DL, MVT::i16)),
20342 DAG.getConstant(3, DL, MVT::i16));
20344 return DAG.getNode((VT.getSizeInBits() < 16 ?
20345 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
20348 /// \brief Lower a vector CTLZ using a natively supported vector CTLZ instruction.
20350 // 1. i32/i64 128/256-bit vectors (native support requires VLX) are extended
20351 //    to a 512-bit vector.
20352 // 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
20353 //    ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
20354 //    split the vector, perform the operation on its Lo and Hi parts and
20355 //    concatenate the results.
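// For example, the ctlz of a v16i8 element x is computed as
//   ctlz32(zext32(x)) - 24
// since zero-extending to 32 bits contributes exactly 32 - 8 = 24 extra
// leading zeros; that difference is the "sub" step in the formula above.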
20356 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
20357 assert(Op.getOpcode() == ISD::CTLZ);
20359 MVT VT = Op.getSimpleValueType();
20360 MVT EltVT = VT.getVectorElementType();
20361 unsigned NumElems = VT.getVectorNumElements();
20363 if (EltVT == MVT::i64 || EltVT == MVT::i32) {
20364 // Extend to 512 bit vector.
20365 assert((VT.is256BitVector() || VT.is128BitVector()) &&
20366 "Unsupported value type for operation");
20368 MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
20369 SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
20370 DAG.getUNDEF(NewVT),
20372 DAG.getIntPtrConstant(0, dl));
20373 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
20375 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
20376 DAG.getIntPtrConstant(0, dl));
20379 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
20380 "Unsupported element type");
20382 if (16 < NumElems) {
20383 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
20385 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
20386 MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
20388 Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
20389 Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
20391 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
20394 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
20396 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
20397 "Unsupported value type for operation");
20399 // Use native supported vector instruction vplzcntd.
20400 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
20401 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
20402 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
20403 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
20405 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
20408 // Lower CTLZ using a PSHUFB lookup table implementation.
20409 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
20410 const X86Subtarget &Subtarget,
20411 SelectionDAG &DAG) {
20412 MVT VT = Op.getSimpleValueType();
20413 int NumElts = VT.getVectorNumElements();
20414 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
20415 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
20417 // Per-nibble leading zero PSHUFB lookup table.
20418 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
20419 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
20420 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
20421 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
20423 SmallVector<SDValue, 64> LUTVec;
20424 for (int i = 0; i < NumBytes; ++i)
20425 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
20426 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
20428 // Begin by bitcasting the input to byte vector, then split those bytes
20429 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
20430 // If the hi input nibble is zero then we add both results together, otherwise
20431 // we just take the hi result (by masking the lo result to zero before the add).
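// Worked example for a single byte: for 0x05 the hi nibble is 0, so HiZ is
// all-ones; the LUT yields 4 for the hi nibble and 1 for the lo nibble, and
// 4 + 1 = 5 == ctlz8(0x05). For 0x1C the hi nibble is 0x1, HiZ is zero, the
// lo contribution is masked away, and LUT[0x1] = 3 == ctlz8(0x1C).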
20433 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
20434 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
20436 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
20437 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
20438 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
20439 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
20440 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
20442 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
20443 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
20444 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
20445 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
20447 // Merge the result back from vXi8 to VT, working on the lo/hi halves
20448 // of the current vector width in the same way we did for the nibbles.
20449 // If the upper half of the input element is zero then add the halves'
20450 // leading zero counts together, otherwise just use the upper half's.
20451 // Double the width of the result until we are at target width.
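// For example, going from i8 to i16 counts (little-endian lanes): for the
// element 0x00F0 the high byte is zero, so its byte count (8) and the low
// byte's count (0) are added, giving ctlz16 = 8; for 0x1234 the high byte is
// nonzero, the low byte's count is masked off, and the high byte's count (3)
// is already the correct 16-bit result.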
20452 while (CurrVT != VT) {
20453 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
20454 int CurrNumElts = CurrVT.getVectorNumElements();
20455 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
20456 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
20457 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
20459 // Check if the upper half of the input element is zero.
20460 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
20461 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
20462 HiZ = DAG.getBitcast(NextVT, HiZ);
20464 // Move the upper/lower halves to the lower bits as we'll be extending to
20465 // NextVT. Mask the lower result to zero if HiZ is true and add the results together.
20467 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
20468 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
20469 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
20470 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
20471 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
20478 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
20479 const X86Subtarget &Subtarget,
20480 SelectionDAG &DAG) {
20481 MVT VT = Op.getSimpleValueType();
20482 SDValue Op0 = Op.getOperand(0);
20484 if (Subtarget.hasAVX512())
20485 return LowerVectorCTLZ_AVX512(Op, DAG);
20487 // Decompose 256-bit ops into smaller 128-bit ops.
20488 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
20489 unsigned NumElems = VT.getVectorNumElements();
20491 // Extract each 128-bit vector, perform ctlz and concat the result.
20492 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
20493 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
20495 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
20496 DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
20497 DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
20500 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
20501 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
20504 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
20505 SelectionDAG &DAG) {
20506 MVT VT = Op.getSimpleValueType();
20508 unsigned NumBits = VT.getSizeInBits();
20510 unsigned Opc = Op.getOpcode();
20513 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
20515 Op = Op.getOperand(0);
20516 if (VT == MVT::i8) {
20517 // Zero extend to i32 since there is no i8 bsr instruction.
20519 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
20522 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
20523 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
20524 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
20526 if (Opc == ISD::CTLZ) {
20527 // If src is zero (i.e. bsr sets ZF), returns NumBits.
20530 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
20531 DAG.getConstant(X86::COND_E, dl, MVT::i8),
20534 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
20537 // Finally xor with NumBits-1.
20538 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
20539 DAG.getConstant(NumBits - 1, dl, OpVT));
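// (BSR returns the index of the highest set bit, so ctlz(x) = (NumBits-1) -
// bsr(x); since NumBits-1 is all ones over the index bits, that subtraction
// is exactly the XOR above. E.g. for i32, x = 0x00010000 gives bsr = 16 and
// 31 ^ 16 = 15 = ctlz(x).)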
20542 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
20546 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
20547 MVT VT = Op.getSimpleValueType();
20548 unsigned NumBits = VT.getScalarSizeInBits();
20551 if (VT.isVector()) {
20552 SDValue N0 = Op.getOperand(0);
20553 SDValue Zero = DAG.getConstant(0, dl, VT);
20555 // lsb(x) = (x & -x)
20556 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
20557 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
20559 // cttz_undef(x) = (width - 1) - ctlz(lsb)
20560 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
20561 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
20562 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
20563 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
20566 // cttz(x) = ctpop(lsb - 1)
20567 SDValue One = DAG.getConstant(1, dl, VT);
20568 return DAG.getNode(ISD::CTPOP, dl, VT,
20569 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
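// Worked example: for x = 0b01101000, x & -x isolates the lowest set bit,
// LSB = 0b00001000; ctpop(LSB - 1) = ctpop(0b00000111) = 3 = cttz(x).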
20572 assert(Op.getOpcode() == ISD::CTTZ &&
20573 "Only scalar CTTZ requires custom lowering");
20575 // Issue a bsf (scan bits forward) which also sets EFLAGS.
20576 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
20577 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
20579 // If src is zero (i.e. bsf sets ZF), returns NumBits.
20582 DAG.getConstant(NumBits, dl, VT),
20583 DAG.getConstant(X86::COND_E, dl, MVT::i8),
20586 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
20589 /// Break a 256-bit integer operation into two new 128-bit ones and then
20590 /// concatenate the result back.
20591 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
20592 MVT VT = Op.getSimpleValueType();
20594 assert(VT.is256BitVector() && VT.isInteger() &&
20595 "Unsupported value type for operation");
20597 unsigned NumElems = VT.getVectorNumElements();
20600 // Extract the LHS vectors
20601 SDValue LHS = Op.getOperand(0);
20602 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
20603 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
20605 // Extract the RHS vectors
20606 SDValue RHS = Op.getOperand(1);
20607 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
20608 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
20610 MVT EltVT = VT.getVectorElementType();
20611 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
20613 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20614 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
20615 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
20618 /// Break a 512-bit integer operation into two new 256-bit ones and then
20619 /// concatenate the result back.
20620 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
20621 MVT VT = Op.getSimpleValueType();
20623 assert(VT.is512BitVector() && VT.isInteger() &&
20624 "Unsupported value type for operation");
20626 unsigned NumElems = VT.getVectorNumElements();
20629 // Extract the LHS vectors
20630 SDValue LHS = Op.getOperand(0);
20631 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
20632 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
20634 // Extract the RHS vectors
20635 SDValue RHS = Op.getOperand(1);
20636 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
20637 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
20639 MVT EltVT = VT.getVectorElementType();
20640 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
20642 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20643 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
20644 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
20647 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
20648 if (Op.getValueType() == MVT::i1)
20649 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
20650 Op.getOperand(0), Op.getOperand(1));
20651 assert(Op.getSimpleValueType().is256BitVector() &&
20652 Op.getSimpleValueType().isInteger() &&
20653 "Only handle AVX 256-bit vector integer operation");
20654 return Lower256IntArith(Op, DAG);
20657 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
20658 if (Op.getValueType() == MVT::i1)
20659 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
20660 Op.getOperand(0), Op.getOperand(1));
20661 assert(Op.getSimpleValueType().is256BitVector() &&
20662 Op.getSimpleValueType().isInteger() &&
20663 "Only handle AVX 256-bit vector integer operation");
20664 return Lower256IntArith(Op, DAG);
20667 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
20668 assert(Op.getSimpleValueType().is256BitVector() &&
20669 Op.getSimpleValueType().isInteger() &&
20670 "Only handle AVX 256-bit vector integer operation");
20671 return Lower256IntArith(Op, DAG);
20674 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
20675 SelectionDAG &DAG) {
20677 MVT VT = Op.getSimpleValueType();
20680 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
20682 // Decompose 256-bit ops into smaller 128-bit ops.
20683 if (VT.is256BitVector() && !Subtarget.hasInt256())
20684 return Lower256IntArith(Op, DAG);
20686 SDValue A = Op.getOperand(0);
20687 SDValue B = Op.getOperand(1);
20689 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
20690 // vector pairs, multiply and truncate.
20691 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
20692 if (Subtarget.hasInt256()) {
20693 // For 512-bit vectors, split into 256-bit vectors to allow the
20694 // sign-extension to occur.
20695 if (VT == MVT::v64i8)
20696 return Lower512IntArith(Op, DAG);
20698 // For 256-bit vectors, split into 128-bit vectors to allow the
20699 // sign-extension to occur. We don't need this on AVX512BW as we can
20700 // safely sign-extend to v32i16.
20701 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
20702 return Lower256IntArith(Op, DAG);
20704 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
20705 return DAG.getNode(
20706 ISD::TRUNCATE, dl, VT,
20707 DAG.getNode(ISD::MUL, dl, ExVT,
20708 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
20709 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
20712 assert(VT == MVT::v16i8 &&
20713 "Pre-AVX2 support only supports v16i8 multiplication");
20714 MVT ExVT = MVT::v8i16;
20716 // Extract the lo parts and sign extend to i16
20718 if (Subtarget.hasSSE41()) {
20719 ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
20720 BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
20722 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
20723 -1, 4, -1, 5, -1, 6, -1, 7};
20724 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20725 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20726 ALo = DAG.getBitcast(ExVT, ALo);
20727 BLo = DAG.getBitcast(ExVT, BLo);
20728 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
20729 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
20732 // Extract the hi parts and sign extend to i16
20734 if (Subtarget.hasSSE41()) {
20735 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20736 -1, -1, -1, -1, -1, -1, -1, -1};
20737 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20738 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20739 AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
20740 BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
20742 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
20743 -1, 12, -1, 13, -1, 14, -1, 15};
20744 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20745 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20746 AHi = DAG.getBitcast(ExVT, AHi);
20747 BHi = DAG.getBitcast(ExVT, BHi);
20748 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
20749 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
20752 // Multiply, mask the lower 8 bits of the lo/hi results and pack
20753 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
20754 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
20755 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
20756 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
20757 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
20760 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
20761 if (VT == MVT::v4i32) {
20762 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
20763 "Should not custom lower when pmuldq is available!");
20765 // Extract the odd parts.
20766 static const int UnpackMask[] = { 1, -1, 3, -1 };
20767 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
20768 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
20770 // Multiply the even parts.
20771 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
20772 // Now multiply odd parts.
20773 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
20775 Evens = DAG.getBitcast(VT, Evens);
20776 Odds = DAG.getBitcast(VT, Odds);
20778 // Merge the two vectors back together with a shuffle. This expands into 2 shuffles.
20780 static const int ShufMask[] = { 0, 4, 2, 6 };
20781 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
20784 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
20785 "Only know how to lower V2I64/V4I64/V8I64 multiply");
20787 // 32-bit vector types used for MULDQ/MULUDQ.
20788 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20790 // MULDQ returns the 64-bit result of the signed multiplication of the lower
20791 // 32-bits. We can lower with this if the sign bits stretch that far.
20792 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
20793 DAG.ComputeNumSignBits(B) > 32) {
20794 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
20795 DAG.getBitcast(MulVT, B));
20798 // Ahi = psrlqi(a, 32);
20799 // Bhi = psrlqi(b, 32);
20801 // AloBlo = pmuludq(a, b);
20802 // AloBhi = pmuludq(a, Bhi);
20803 // AhiBlo = pmuludq(Ahi, b);
20805 // Hi = psllqi(AloBhi + AhiBlo, 32);
20806 // return AloBlo + Hi;
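// With A = Ahi*2^32 + Alo and B = Bhi*2^32 + Blo, the identity used is
//   A*B mod 2^64 = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32);
// the Ahi*Bhi term is shifted out of the low 64 bits entirely, so only the
// three pmuludq products shown above are needed.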
20807 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
20808 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
20809 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
20811 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
20812 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
20813 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
20815 // Bit cast to 32-bit vectors for MULUDQ.
20816 SDValue Alo = DAG.getBitcast(MulVT, A);
20817 SDValue Blo = DAG.getBitcast(MulVT, B);
20819 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
20821 // Only multiply lo/hi halves that aren't known to be zero.
20822 SDValue AloBlo = Zero;
20823 if (!ALoIsZero && !BLoIsZero)
20824 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
20826 SDValue AloBhi = Zero;
20827 if (!ALoIsZero && !BHiIsZero) {
20828 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
20829 Bhi = DAG.getBitcast(MulVT, Bhi);
20830 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
20833 SDValue AhiBlo = Zero;
20834 if (!AHiIsZero && !BLoIsZero) {
20835 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
20836 Ahi = DAG.getBitcast(MulVT, Ahi);
20837 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
20840 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
20841 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
20843 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
20846 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
20847 SelectionDAG &DAG) {
20849 MVT VT = Op.getSimpleValueType();
20851 // Decompose 256-bit ops into smaller 128-bit ops.
20852 if (VT.is256BitVector() && !Subtarget.hasInt256())
20853 return Lower256IntArith(Op, DAG);
20855 // Only i8 vectors should need custom lowering after this.
20856 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
20857 "Unsupported vector type");
20859 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
20860 // logical shift down the upper half and pack back to i8.
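// For example, MULHU on i8 operands 0xC8 and 0x03: the widened i16 product is
// 0x0258, and a logical shift right by 8 leaves 0x02, the high half of the
// 8-bit product.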
20861 SDValue A = Op.getOperand(0);
20862 SDValue B = Op.getOperand(1);
20864 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
20865 // and then ashr/lshr the upper bits down to the lower bits before multiply.
20866 unsigned Opcode = Op.getOpcode();
20867 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
20868 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
20870 // AVX2 implementations - extend xmm subvectors to ymm.
20871 if (Subtarget.hasInt256()) {
20872 SDValue Lo = DAG.getIntPtrConstant(0, dl);
20873 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
20875 if (VT == MVT::v32i8) {
20876 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
20877 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
20878 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
20879 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
20880 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
20881 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
20882 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
20883 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
20884 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
20885 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
20886 DAG.getConstant(8, dl, MVT::v16i16));
20887 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
20888 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
20889 DAG.getConstant(8, dl, MVT::v16i16));
20890 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
20891 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
20892 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
20893 16, 17, 18, 19, 20, 21, 22, 23};
20894 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20895 24, 25, 26, 27, 28, 29, 30, 31};
20896 return DAG.getNode(X86ISD::PACKUS, dl, VT,
20897 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
20898 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
20901 SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
20902 SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
20903 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
20904 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
20905 DAG.getConstant(8, dl, MVT::v16i16));
20906 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
20907 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
20908 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
20911 assert(VT == MVT::v16i8 &&
20912 "Pre-AVX2 support only supports v16i8 multiplication");
20913 MVT ExVT = MVT::v8i16;
20915 // Extract the lo parts and zero/sign extend to i16.
20917 if (Subtarget.hasSSE41()) {
20918 ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
20919 BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
20921 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
20922 -1, 4, -1, 5, -1, 6, -1, 7};
20923 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20924 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20925 ALo = DAG.getBitcast(ExVT, ALo);
20926 BLo = DAG.getBitcast(ExVT, BLo);
20927 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
20928 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
20931 // Extract the hi parts and zero/sign extend to i16.
20933 if (Subtarget.hasSSE41()) {
20934 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20935 -1, -1, -1, -1, -1, -1, -1, -1};
20936 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20937 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20938 AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
20939 BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
20941 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
20942 -1, 12, -1, 13, -1, 14, -1, 15};
20943 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20944 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20945 AHi = DAG.getBitcast(ExVT, AHi);
20946 BHi = DAG.getBitcast(ExVT, BHi);
20947 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
20948 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
20951 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
20952 // pack back to v16i8.
20953 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
20954 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
20955 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
20956 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
20957 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
20960 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
20961 assert(Subtarget.isTargetWin64() && "Unexpected target");
20962 EVT VT = Op.getValueType();
20963 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
20964 "Unexpected return type for lowering");
20968 switch (Op->getOpcode()) {
20969 default: llvm_unreachable("Unexpected request for libcall!");
20970 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
20971 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
20972 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
20973 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
20974 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
20975 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
20979 SDValue InChain = DAG.getEntryNode();
20981 TargetLowering::ArgListTy Args;
20982 TargetLowering::ArgListEntry Entry;
20983 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
20984 EVT ArgVT = Op->getOperand(i).getValueType();
20985 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
20986 "Unexpected argument type for lowering");
20987 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
20988 Entry.Node = StackPtr;
20989 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
20990 MachinePointerInfo(), /* Alignment = */ 16);
20991 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
20992 Entry.Ty = PointerType::get(ArgTy,0);
20993 Entry.isSExt = false;
20994 Entry.isZExt = false;
20995 Args.push_back(Entry);
20998 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
20999 getPointerTy(DAG.getDataLayout()));
21001 TargetLowering::CallLoweringInfo CLI(DAG);
21002 CLI.setDebugLoc(dl).setChain(InChain)
21003 .setCallee(getLibcallCallingConv(LC),
21004 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
21005 Callee, std::move(Args))
21006 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
21008 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21009 return DAG.getBitcast(VT, CallInfo.first);
21012 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21013 SelectionDAG &DAG) {
21014 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21015 MVT VT = Op0.getSimpleValueType();
21018 // Decompose 256-bit ops into smaller 128-bit ops.
21019 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21020 unsigned Opcode = Op.getOpcode();
21021 unsigned NumElems = VT.getVectorNumElements();
21022 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21023 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21024 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21025 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21026 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
21027 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21028 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
21030 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21031 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
21033 return DAG.getMergeValues(Ops, dl);
21036 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21037 (VT == MVT::v8i32 && Subtarget.hasInt256()));
21039 // PMULxD operations multiply each even value (starting at 0) of LHS with
21040 // the related value of RHS and produce a widened result.
21041 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21042 // => <2 x i64> <ae|cg>
21044 // In other words, to have all the results, we need to perform two PMULxD:
21045 // 1. one with the even values.
21046 // 2. one with the odd values.
21047 // To achieve #2, we need to place the odd values at an even position.
21049 // Place the odd value at an even position (basically, shift all values 1
21050 // step to the left):
21051 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
21052 // <a|b|c|d> => <b|undef|d|undef>
21053 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21054 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21055 // <e|f|g|h> => <f|undef|h|undef>
21056 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21057 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21059 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
21061 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
21062 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
21064 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21065 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21066 // => <2 x i64> <ae|cg>
21067 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21068 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21069 // => <2 x i64> <bf|dh>
21070 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21072 // Shuffle it back into the right order.
21073 SDValue Highs, Lows;
21074 if (VT == MVT::v8i32) {
21075 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
21076 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21077 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
21078 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21080 const int HighMask[] = {1, 5, 3, 7};
21081 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21082 const int LowMask[] = {0, 4, 2, 6};
21083 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
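// Illustration for the v4i32 case: after the bitcasts, Mul1 is
// <lo(ae)|hi(ae)|lo(cg)|hi(cg)> and Mul2 is <lo(bf)|hi(bf)|lo(dh)|hi(dh)>, so
// the {0,4,2,6} and {1,5,3,7} masks interleave them back into the original
// element order a,b,c,d, giving the low and high halves of all four products.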
21086 // If we have a signed multiply but no PMULDQ, fix up the high parts of an
21087 // unsigned multiply.
21088 if (IsSigned && !Subtarget.hasSSE41()) {
21089 SDValue ShAmt = DAG.getConstant(
21091 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21092 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21093 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21094 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21095 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21097 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21098 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21101 // The first result of MUL_LOHI is actually the low value, followed by the high value.
21103 SDValue Ops[] = {Lows, Highs};
21104 return DAG.getMergeValues(Ops, dl);
21107 // Return true if the required (according to Opcode) shift-imm form is natively
21108 // supported by the Subtarget
21109 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
21111 if (VT.getScalarSizeInBits() < 16)
21114 if (VT.is512BitVector() &&
21115 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
21118 bool LShift = VT.is128BitVector() ||
21119 (VT.is256BitVector() && Subtarget.hasInt256());
21121 bool AShift = LShift && (Subtarget.hasVLX() ||
21122 (VT != MVT::v2i64 && VT != MVT::v4i64));
21123 return (Opcode == ISD::SRA) ? AShift : LShift;
21126 // The shift amount is a variable, but it is the same for all vector lanes.
21127 // These instructions are defined together with shift-immediate.
21129 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21131 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21134 // Return true if the required (according to Opcode) variable-shift form is
21135 // natively supported by the Subtarget
21136 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21139 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21142 // vXi16 supported only on AVX-512, BWI
21143 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21146 if (VT.is512BitVector() || Subtarget.hasVLX())
21149 bool LShift = VT.is128BitVector() || VT.is256BitVector();
21150 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21151 return (Opcode == ISD::SRA) ? AShift : LShift;
21154 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21155 const X86Subtarget &Subtarget) {
21156 MVT VT = Op.getSimpleValueType();
21158 SDValue R = Op.getOperand(0);
21159 SDValue Amt = Op.getOperand(1);
21161 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21162 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21164 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
21165 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21166 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
21167 SDValue Ex = DAG.getBitcast(ExVT, R);
21169 if (ShiftAmt >= 32) {
21170 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
21172 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
21173 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21174 ShiftAmt - 32, DAG);
21175 if (VT == MVT::v2i64)
21176 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
21177 if (VT == MVT::v4i64)
21178 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21179 {9, 1, 11, 3, 13, 5, 15, 7});
21181 // SRA upper i32, SHL whole i64 and select lower i32.
21182 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21185 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
21186 Lower = DAG.getBitcast(ExVT, Lower);
21187 if (VT == MVT::v2i64)
21188 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
21189 if (VT == MVT::v4i64)
21190 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21191 {8, 1, 10, 3, 12, 5, 14, 7});
21193 return DAG.getBitcast(VT, Ex);
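// Worked example: an arithmetic shift of 0xFFFFFFFF00000000 by 33 should
// yield -1. With ShiftAmt >= 32, Upper splats the sign of the high dword
// (0xFFFFFFFF) and Lower computes hi32 >> (33 - 32) = 0xFFFFFFFF; the shuffle
// then rebuilds each lane as {lo = Lower, hi = Upper} = 0xFFFFFFFFFFFFFFFF.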
21196 // Optimize shl/srl/sra with constant shift amount.
21197 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21198 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
21199 uint64_t ShiftAmt = ShiftConst->getZExtValue();
21201 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21202 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21204 // i64 SRA needs to be performed as partial shifts.
21205 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
21206 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
21207 return ArithmeticShiftRight64(ShiftAmt);
21209 if (VT == MVT::v16i8 ||
21210 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
21211 VT == MVT::v64i8) {
21212 unsigned NumElts = VT.getVectorNumElements();
21213 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
21215 // Simple i8 add case
21216 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
21217 return DAG.getNode(ISD::ADD, dl, VT, R, R);
21219 // ashr(R, 7) === cmp_slt(R, 0)
21220 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
21221 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21222 if (VT.is512BitVector()) {
21223 assert(VT == MVT::v64i8 && "Unexpected element type!");
21224 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
21225 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
21227 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
21230 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
21231 if (VT == MVT::v16i8 && Subtarget.hasXOP())
21234 if (Op.getOpcode() == ISD::SHL) {
21235 // Make a large shift.
21236 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
21238 SHL = DAG.getBitcast(VT, SHL);
21239 // Zero out the rightmost bits.
21240 return DAG.getNode(ISD::AND, dl, VT, SHL,
21241 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
21243 if (Op.getOpcode() == ISD::SRL) {
21244 // Make a large shift.
21245 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
21247 SRL = DAG.getBitcast(VT, SRL);
21248 // Zero out the leftmost bits.
21249 return DAG.getNode(ISD::AND, dl, VT, SRL,
21250 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
21252 if (Op.getOpcode() == ISD::SRA) {
21253 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
21254 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21256 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
21257 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
21258 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
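// Worked example: x = 0x90 (-112 as i8), ShiftAmt = 4. lshr gives 0x09 and
// Mask = 128 >> 4 = 0x08; the xor yields 0x01 and the sub 0x01 - 0x08 = -7
// (0xF9), matching the arithmetic shift -112 >> 4 = -7.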
21261 llvm_unreachable("Unknown shift opcode.");
21266 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21267 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
21268 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
21269 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
21271 // Peek through any splat that was introduced for i64 shift vectorization.
21272 int SplatIndex = -1;
21273 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
21274 if (SVN->isSplat()) {
21275 SplatIndex = SVN->getSplatIndex();
21276 Amt = Amt.getOperand(0);
21277 assert(SplatIndex < (int)VT.getVectorNumElements() &&
21278 "Splat shuffle referencing second operand");
21281 if (Amt.getOpcode() != ISD::BITCAST ||
21282 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
21285 Amt = Amt.getOperand(0);
21286 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21287 VT.getVectorNumElements();
21288 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
21289 uint64_t ShiftAmt = 0;
21290 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
21291 for (unsigned i = 0; i != Ratio; ++i) {
21292 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
21296 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
21299 // Check remaining shift amounts (if not a splat).
21300 if (SplatIndex < 0) {
21301 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21302 uint64_t ShAmt = 0;
21303 for (unsigned j = 0; j != Ratio; ++j) {
21304 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
21308 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
21310 if (ShAmt != ShiftAmt)
21315 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21316 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21318 if (Op.getOpcode() == ISD::SRA)
21319 return ArithmeticShiftRight64(ShiftAmt);
21325 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
21326 const X86Subtarget &Subtarget) {
21327 MVT VT = Op.getSimpleValueType();
21329 SDValue R = Op.getOperand(0);
21330 SDValue Amt = Op.getOperand(1);
21332 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21333 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21335 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
21336 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
21338 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
21340 MVT EltVT = VT.getVectorElementType();
21342 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
21343 // Check if this build_vector node is doing a splat.
21344 // If so, then set BaseShAmt equal to the splat value.
21345 BaseShAmt = BV->getSplatValue();
21346 if (BaseShAmt && BaseShAmt.isUndef())
21347 BaseShAmt = SDValue();
21349 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
21350 Amt = Amt.getOperand(0);
21352 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
21353 if (SVN && SVN->isSplat()) {
21354 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
21355 SDValue InVec = Amt.getOperand(0);
21356 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
21357 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
21358 "Unexpected shuffle index found!");
21359 BaseShAmt = InVec.getOperand(SplatIdx);
21360 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
21361 if (ConstantSDNode *C =
21362 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
21363 if (C->getZExtValue() == SplatIdx)
21364 BaseShAmt = InVec.getOperand(1);
21369 // Avoid introducing an extract element from a shuffle.
21370 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
21371 DAG.getIntPtrConstant(SplatIdx, dl));
21375 if (BaseShAmt.getNode()) {
21376 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
21377 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
21378 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
21379 else if (EltVT.bitsLT(MVT::i32))
21380 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
21382 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
21386 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21387 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
21388 Amt.getOpcode() == ISD::BITCAST &&
21389 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
21390 Amt = Amt.getOperand(0);
21391 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21392 VT.getVectorNumElements();
21393 std::vector<SDValue> Vals(Ratio);
21394 for (unsigned i = 0; i != Ratio; ++i)
21395 Vals[i] = Amt.getOperand(i);
21396 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21397 for (unsigned j = 0; j != Ratio; ++j)
21398 if (Vals[j] != Amt.getOperand(i + j))
21402 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
21403 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
21408 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
21409 SelectionDAG &DAG) {
21410 MVT VT = Op.getSimpleValueType();
21412 SDValue R = Op.getOperand(0);
21413 SDValue Amt = Op.getOperand(1);
21414 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
21416 assert(VT.isVector() && "Custom lowering only for vector shifts!");
21417 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
21419 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
21422 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
21425 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
21428 // XOP has 128-bit variable logical/arithmetic shifts.
21429 // +ve/-ve Amt = shift left/right.
21430 if (Subtarget.hasXOP() &&
21431 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
21432 VT == MVT::v8i16 || VT == MVT::v16i8)) {
21433 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
21434 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21435 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
21437 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
21438 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
21439 if (Op.getOpcode() == ISD::SRA)
21440 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
// v2i64 vector logical shifts can efficiently avoid scalarization - do the
21444 // shifts per-lane and then shuffle the partial results back together.
21445 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
21446 // Splat the shift amounts so the scalar shifts above will catch it.
21447 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
21448 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
21449 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
21450 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
21451 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
21454 // i64 vector arithmetic shift can be emulated with the transform:
21455 // M = lshr(SIGN_BIT, Amt)
21456 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
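  // Worked example (8-bit analogue, for illustration only): R = 0xF0 (-16),
  // Amt = 2: lshr(R, 2) = 0x3C, M = lshr(0x80, 2) = 0x20,
  // xor = 0x1C, sub M => 0xFC = -4 = ashr(-16, 2).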
21457 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
21458 Op.getOpcode() == ISD::SRA) {
21459 SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
21460 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
21461 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21462 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
21463 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
21467 // If possible, lower this packed shift into a vector multiply instead of
21468 // expanding it into a sequence of scalar shifts.
21469 // Do this only if the vector shift count is a constant build_vector.
21470 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
21471 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
21472 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
21473 SmallVector<SDValue, 8> Elts;
21474 MVT SVT = VT.getVectorElementType();
21475 unsigned SVTBits = SVT.getSizeInBits();
21476 APInt One(SVTBits, 1);
21477 unsigned NumElems = VT.getVectorNumElements();
    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Op = Amt->getOperand(i);
      if (Op->isUndef()) {
        Elts.push_back(Op);
        continue;
      }
      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
      APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
      uint64_t ShAmt = C.getZExtValue();
      if (ShAmt >= SVTBits) {
        Elts.push_back(DAG.getUNDEF(SVT));
        continue;
      }
      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
    }
    SDValue BV = DAG.getBuildVector(VT, dl, Elts);
    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
  }
21499 // Lower SHL with variable shift amount.
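  // The sequence below relies on the classic float-exponent trick (noted here
  // as an aid to the reader): (Amt << 23) + 0x3f800000 builds an IEEE-754
  // single with exponent 127 + Amt, i.e. the value 2^Amt, so after FP_TO_SINT
  // the multiply computes R * 2^Amt == R << Amt. For example, Amt = 3 gives
  // bits 0x41000000 = 8.0f, and R * 8 == R << 3.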
21500 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
21501 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
21503 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
21504 DAG.getConstant(0x3f800000U, dl, VT));
21505 Op = DAG.getBitcast(MVT::v4f32, Op);
21506 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
21507 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
21510 // If possible, lower this shift as a sequence of two shifts by
21511 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
21513 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
21515 // Could be rewritten as:
21516 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
21518 // The advantage is that the two shifts from the example would be
21519 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
  // the vector shift into four scalar shifts plus four pairs of vector
  // insert/extract.
21522 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
21523 unsigned TargetOpcode = X86ISD::MOVSS;
21524 bool CanBeSimplified;
21525 // The splat value for the first packed shift (the 'X' from the example).
21526 SDValue Amt1 = Amt->getOperand(0);
21527 // The splat value for the second packed shift (the 'Y' from the example).
21528 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
21530 // See if it is possible to replace this node with a sequence of
21531 // two shifts followed by a MOVSS/MOVSD/PBLEND.
21532 if (VT == MVT::v4i32) {
21533 // Check if it is legal to use a MOVSS.
21534 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
21535 Amt2 == Amt->getOperand(3);
21536 if (!CanBeSimplified) {
21537 // Otherwise, check if we can still simplify this node using a MOVSD.
21538 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
21539 Amt->getOperand(2) == Amt->getOperand(3);
21540 TargetOpcode = X86ISD::MOVSD;
21541 Amt2 = Amt->getOperand(2);
21544 // Do similar checks for the case where the machine value type
21546 CanBeSimplified = Amt1 == Amt->getOperand(1);
21547 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
21548 CanBeSimplified = Amt2 == Amt->getOperand(i);
21550 if (!CanBeSimplified) {
21551 TargetOpcode = X86ISD::MOVSD;
21552 CanBeSimplified = true;
21553 Amt2 = Amt->getOperand(4);
21554 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
21555 CanBeSimplified = Amt1 == Amt->getOperand(i);
21556 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
21557 CanBeSimplified = Amt2 == Amt->getOperand(j);
21561 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
21562 isa<ConstantSDNode>(Amt2)) {
21563 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
21564 MVT CastVT = MVT::v4i32;
21566 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
21567 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
21569 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
21570 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
21571 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
21572 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
21573 if (TargetOpcode == X86ISD::MOVSD)
21574 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
21575 BitCast2, {0, 1, 6, 7}));
21576 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
21577 BitCast2, {0, 5, 6, 7}));
21581 // v4i32 Non Uniform Shifts.
21582 // If the shift amount is constant we can shift each lane using the SSE2
21583 // immediate shifts, else we need to zero-extend each lane to the lower i64
21584 // and shift using the SSE2 variable shifts.
21585 // The separate results can then be blended together.
21586 if (VT == MVT::v4i32) {
21587 unsigned Opc = Op.getOpcode();
21588 SDValue Amt0, Amt1, Amt2, Amt3;
    if (ConstantAmt) {
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
    } else {
      // ISD::SHL is handled above but we include it here for completeness.
      switch (Opc) {
      default: llvm_unreachable("Unknown target vector shift node");
      case ISD::SHL: Opc = X86ISD::VSHL; break;
      case ISD::SRL: Opc = X86ISD::VSRL; break;
      case ISD::SRA: Opc = X86ISD::VSRA; break;
      }
      // The SSE2 shifts use the lower i64 as the same shift amount for
      // all lanes and the upper i64 is ignored. These shuffle masks
      // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
      SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
    }
21619 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
21620 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
21621 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
21622 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
21623 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
21624 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
21625 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
21628 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
21629 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
21630 // make the existing SSE solution better.
21631 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
21632 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
21633 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
21634 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
21635 MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
21636 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
21638 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
21639 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
21640 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
21641 return DAG.getNode(ISD::TRUNCATE, dl, VT,
21642 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
21645 if (VT == MVT::v16i8 ||
21646 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
21647 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
21648 unsigned ShiftOpcode = Op->getOpcode();
21650 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
21651 // On SSE41 targets we make use of the fact that VSELECT lowers
21652 // to PBLENDVB which selects bytes based just on the sign bit.
21653 if (Subtarget.hasSSE41()) {
21654 V0 = DAG.getBitcast(VT, V0);
21655 V1 = DAG.getBitcast(VT, V1);
21656 Sel = DAG.getBitcast(VT, Sel);
21657 return DAG.getBitcast(SelVT,
21658 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
21660 // On pre-SSE41 targets we test for the sign bit by comparing to
21661 // zero - a negative value will set all bits of the lanes to true
21662 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
21663 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
21664 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
21665 return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
21668 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
21669 // We can safely do this using i16 shifts as we're only interested in
21670 // the 3 lower bits of each byte.
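    // Explanatory note: after the shift by 5, bit 2 of each original amount
    // sits in the byte's sign bit and selects the shift-by-4 step below;
    // adding Amt to itself after each step promotes bit 1 (shift-by-2) and
    // then bit 0 (shift-by-1) into the sign bit, so e.g. an amount of 6
    // (0b110) takes the shift-by-4 and shift-by-2 steps only.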
21671 Amt = DAG.getBitcast(ExtVT, Amt);
21672 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
21673 Amt = DAG.getBitcast(VT, Amt);
21675 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
21676 // r = VSELECT(r, shift(r, 4), a);
21678 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
21679 R = SignBitSelect(VT, Amt, M, R);
21682 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21684 // r = VSELECT(r, shift(r, 2), a);
21685 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
21686 R = SignBitSelect(VT, Amt, M, R);
21689 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21691 // return VSELECT(r, shift(r, 1), a);
21692 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
21693 R = SignBitSelect(VT, Amt, M, R);
21697 if (Op->getOpcode() == ISD::SRA) {
    // For SRA we need to unpack each byte to the higher byte of a i16 vector
    // so we can correctly sign extend. We don't care what happens to the
    // lower byte.
21701 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
21702 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
21703 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
21704 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
21705 ALo = DAG.getBitcast(ExtVT, ALo);
21706 AHi = DAG.getBitcast(ExtVT, AHi);
21707 RLo = DAG.getBitcast(ExtVT, RLo);
21708 RHi = DAG.getBitcast(ExtVT, RHi);
21710 // r = VSELECT(r, shift(r, 4), a);
21711 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
21712 DAG.getConstant(4, dl, ExtVT));
21713 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
21714 DAG.getConstant(4, dl, ExtVT));
21715 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
21716 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
21719 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
21720 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
21722 // r = VSELECT(r, shift(r, 2), a);
21723 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
21724 DAG.getConstant(2, dl, ExtVT));
21725 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
21726 DAG.getConstant(2, dl, ExtVT));
21727 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
21728 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
21731 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
21732 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
21734 // r = VSELECT(r, shift(r, 1), a);
21735 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
21736 DAG.getConstant(1, dl, ExtVT));
21737 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
21738 DAG.getConstant(1, dl, ExtVT));
21739 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
21740 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
    // Logical shift the result back to the lower byte, leaving a zero upper
    // byte, meaning that we can safely pack with PACKUSWB.
21746 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
21748 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
21749 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21753 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
21754 MVT ExtVT = MVT::v8i32;
21755 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
21756 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
21757 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
21758 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
21759 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
21760 ALo = DAG.getBitcast(ExtVT, ALo);
21761 AHi = DAG.getBitcast(ExtVT, AHi);
21762 RLo = DAG.getBitcast(ExtVT, RLo);
21763 RHi = DAG.getBitcast(ExtVT, RHi);
21764 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
21765 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
21766 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
21767 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
21768 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21771 if (VT == MVT::v8i16) {
21772 unsigned ShiftOpcode = Op->getOpcode();
    // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
21776 bool UseSSE41 = Subtarget.hasSSE41() &&
21777 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
21779 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
21780 // On SSE41 targets we make use of the fact that VSELECT lowers
21781 // to PBLENDVB which selects bytes based just on the sign bit.
21783 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
21784 V0 = DAG.getBitcast(ExtVT, V0);
21785 V1 = DAG.getBitcast(ExtVT, V1);
21786 Sel = DAG.getBitcast(ExtVT, Sel);
21787 return DAG.getBitcast(
21788 VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
21790 // On pre-SSE41 targets we splat the sign bit - a negative value will
21791 // set all bits of the lanes to true and VSELECT uses that in
21792 // its OR(AND(V0,C),AND(V1,~C)) lowering.
21794 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
21795 return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
21798 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
21800 // On SSE41 targets we need to replicate the shift mask in both
21801 // bytes for PBLENDVB.
21804 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
21805 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
21807 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
21810 // r = VSELECT(r, shift(r, 8), a);
21811 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
21812 R = SignBitSelect(Amt, M, R);
21815 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21817 // r = VSELECT(r, shift(r, 4), a);
21818 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
21819 R = SignBitSelect(Amt, M, R);
21822 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21824 // r = VSELECT(r, shift(r, 2), a);
21825 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
21826 R = SignBitSelect(Amt, M, R);
21829 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21831 // return VSELECT(r, shift(r, 1), a);
21832 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
21833 R = SignBitSelect(Amt, M, R);
21837 // Decompose 256-bit shifts into smaller 128-bit shifts.
21838 if (VT.is256BitVector())
21839 return Lower256IntArith(Op, DAG);
21844 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
21845 SelectionDAG &DAG) {
21846 MVT VT = Op.getSimpleValueType();
21848 SDValue R = Op.getOperand(0);
21849 SDValue Amt = Op.getOperand(1);
21851 assert(VT.isVector() && "Custom lowering only for vector rotates!");
21852 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
21853 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
21855 // XOP has 128-bit vector variable + immediate rotates.
21856 // +ve/-ve Amt = rotate left/right.
21858 // Split 256-bit integers.
21859 if (VT.is256BitVector())
21860 return Lower256IntArith(Op, DAG);
21862 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
21864 // Attempt to rotate by immediate.
21865 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21866 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
21867 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
21868 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
21869 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
21870 DAG.getConstant(RotateAmt, DL, MVT::i8));
21874 // Use general rotate by variable (per-element).
21875 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
21878 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
21879 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
21880 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
21881 // looks for this combo and may remove the "setcc" instruction if the "setcc"
21882 // has only one use.
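  // For example (illustrative): {i32, i1} = ISD::UADDO a, b becomes an
  // X86ISD::ADD that also produces EFLAGS, followed by a setcc on
  // X86::COND_B (the carry flag), as set up in the switch below.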
21883 SDNode *N = Op.getNode();
21884 SDValue LHS = N->getOperand(0);
21885 SDValue RHS = N->getOperand(1);
21886 unsigned BaseOp = 0;
21887 X86::CondCode Cond;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::INC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::DEC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
21925 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
21926 if (N->getValueType(0) == MVT::i8) {
21927 BaseOp = X86ISD::UMUL8;
21928 Cond = X86::COND_O;
21931 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
21933 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
21935 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
21937 if (N->getValueType(1) == MVT::i1)
21938 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
21940 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
21944 // Also sets EFLAGS.
21945 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
21946 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
21948 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
21950 if (N->getValueType(1) == MVT::i1)
21951 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
21953 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
21956 /// Returns true if the operand type is exactly twice the native width, and
21957 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
21958 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
21959 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
  unsigned OpWidth = MemType->getPrimitiveSizeInBits();
  if (OpWidth == 64)
    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
  else if (OpWidth == 128)
    return Subtarget.hasCmpxchg16b();
  return false;
}
21971 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
21972 return needsCmpXchgNb(SI->getValueOperand()->getType());
21975 // Note: this turns large loads into lock cmpxchg8b/16b.
21976 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
21977 TargetLowering::AtomicExpansionKind
21978 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
21979 auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
21980 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
21981 : AtomicExpansionKind::None;
21984 TargetLowering::AtomicExpansionKind
21985 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
21986 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
21987 Type *MemType = AI->getType();
21989 // If the operand is too big, we must see if cmpxchg8/16b is available
21990 // and default to library calls otherwise.
21991 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
21992 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
21993 : AtomicExpansionKind::None;
21996 AtomicRMWInst::BinOp Op = AI->getOperation();
21999 llvm_unreachable("Unknown atomic operation");
22000 case AtomicRMWInst::Xchg:
22001 case AtomicRMWInst::Add:
22002 case AtomicRMWInst::Sub:
22003 // It's better to use xadd, xsub or xchg for these in all cases.
22004 return AtomicExpansionKind::None;
22005 case AtomicRMWInst::Or:
22006 case AtomicRMWInst::And:
22007 case AtomicRMWInst::Xor:
22008 // If the atomicrmw's result isn't actually used, we can just add a "lock"
22009 // prefix to a normal instruction for these operations.
22010 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22011 : AtomicExpansionKind::None;
22012 case AtomicRMWInst::Nand:
22013 case AtomicRMWInst::Max:
22014 case AtomicRMWInst::Min:
22015 case AtomicRMWInst::UMax:
22016 case AtomicRMWInst::UMin:
22017 // These always require a non-trivial set of data operations on x86. We must
22018 // use a cmpxchg loop.
22019 return AtomicExpansionKind::CmpXChg;
22024 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22025 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22026 Type *MemType = AI->getType();
22027 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22028 // there is no benefit in turning such RMWs into loads, and it is actually
22029 // harmful as it introduces a mfence.
22030 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
22033 auto Builder = IRBuilder<>(AI);
22034 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22035 auto SynchScope = AI->getSynchScope();
22036 // We must restrict the ordering to avoid generating loads with Release or
22037 // ReleaseAcquire orderings.
22038 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22039 auto Ptr = AI->getPointerOperand();
  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
22050 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22051 // lowered to just a load without a fence. A mfence flushes the store buffer,
22052 // making the optimization clearly correct.
22053 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
22054 // otherwise, we might be able to be more aggressive on relaxed idempotent
22055 // rmw. In practice, they do not look useful, so we don't try to be
22056 // especially clever.
22057 if (SynchScope == SingleThread)
22058 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
22059 // the IR level, so we must wrap it in an intrinsic.
22062 if (!Subtarget.hasMFence())
22063 // FIXME: it might make sense to use a locked operation here but on a
22064 // different cache-line to prevent cache-line bouncing. In practice it
22065 // is probably a small win, and x86 processors without mfence are rare
22066 // enough that we do not bother.
22070 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
22071 Builder.CreateCall(MFence, {});
22073 // Finally we can emit the atomic load.
22074 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
22075 AI->getType()->getPrimitiveSizeInBits());
22076 Loaded->setAtomic(Order, SynchScope);
22077 AI->replaceAllUsesWith(Loaded);
22078 AI->eraseFromParent();
22082 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
22083 SelectionDAG &DAG) {
22085 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
22086 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
22087 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
22088 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
22090 // The only fence that needs an instruction is a sequentially-consistent
22091 // cross-thread fence.
22092 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
22093 FenceScope == CrossThread) {
22094 if (Subtarget.hasMFence())
22095 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
22097 SDValue Chain = Op.getOperand(0);
22098 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
22100 DAG.getRegister(X86::ESP, MVT::i32), // Base
22101 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
22102 DAG.getRegister(0, MVT::i32), // Index
22103 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
22104 DAG.getRegister(0, MVT::i32), // Segment.
22108 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
22109 return SDValue(Res, 0);
22112 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
22113 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
22116 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
22117 SelectionDAG &DAG) {
22118 MVT T = Op.getSimpleValueType();
22122 switch(T.SimpleTy) {
22123 default: llvm_unreachable("Invalid value type!");
22124 case MVT::i8: Reg = X86::AL; size = 1; break;
22125 case MVT::i16: Reg = X86::AX; size = 2; break;
22126 case MVT::i32: Reg = X86::EAX; size = 4; break;
22128 assert(Subtarget.is64Bit() && "Node not type legal!");
22129 Reg = X86::RAX; size = 8;
22132 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
22133 Op.getOperand(2), SDValue());
22134 SDValue Ops[] = { cpIn.getValue(0),
22137 DAG.getTargetConstant(size, DL, MVT::i8),
22138 cpIn.getValue(1) };
22139 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
22140 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
22141 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
22145 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
22146 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
22147 MVT::i32, cpOut.getValue(2));
22148 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
22150 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
22151 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
22152 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
22156 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
22157 SelectionDAG &DAG) {
22158 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
22159 MVT DstVT = Op.getSimpleValueType();
22161 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
22162 SrcVT == MVT::i64) {
22163 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();
22168 SDValue Op0 = Op->getOperand(0);
22169 SmallVector<SDValue, 16> Elts;
22173 if (SrcVT.isVector()) {
22174 NumElts = SrcVT.getVectorNumElements();
22175 SVT = SrcVT.getVectorElementType();
      // Widen the input vector in the case of MVT::v2i32.
      // Example: from MVT::v2i32 to MVT::v4i32.
22179 for (unsigned i = 0, e = NumElts; i != e; ++i)
22180 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
22181 DAG.getIntPtrConstant(i, dl)));
22183 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
22184 "Unexpected source type in LowerBITCAST");
22185 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22186 DAG.getIntPtrConstant(0, dl)));
22187 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22188 DAG.getIntPtrConstant(1, dl)));
22192 // Explicitly mark the extra elements as Undef.
22193 Elts.append(NumElts, DAG.getUNDEF(SVT));
22195 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22196 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
22197 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
22198 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
22199 DAG.getIntPtrConstant(0, dl));
22202 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
22203 Subtarget.hasMMX() && "Unexpected custom BITCAST");
22204 assert((DstVT == MVT::i64 ||
22205 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
22206 "Unexpected custom BITCAST");
22207 // i64 <=> MMX conversions are Legal.
22208 if (SrcVT==MVT::i64 && DstVT.isVector())
22210 if (DstVT==MVT::i64 && SrcVT.isVector())
22212 // MMX <=> MMX conversions are Legal.
22213 if (SrcVT.isVector() && DstVT.isVector())
22215 // All other conversions need to be expanded.
22219 /// Compute the horizontal sum of bytes in V for the elements of VT.
22221 /// Requires V to be a byte vector and VT to be an integer vector type with
22222 /// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
22225 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
22226 const X86Subtarget &Subtarget,
22227 SelectionDAG &DAG) {
22229 MVT ByteVecVT = V.getSimpleValueType();
22230 MVT EltVT = VT.getVectorElementType();
22231 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
22232 "Expected value to have byte element type.");
22233 assert(EltVT != MVT::i8 &&
22234 "Horizontal byte sum only makes sense for wider elements!");
22235 unsigned VecSize = VT.getSizeInBits();
22236 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
  // The PSADBW instruction horizontally adds all bytes and leaves the result
  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
22240 if (EltVT == MVT::i64) {
22241 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22242 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22243 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
22244 return DAG.getBitcast(VT, V);
22247 if (EltVT == MVT::i32) {
22248 // We unpack the low half and high half into i32s interleaved with zeros so
22249 // that we can use PSADBW to horizontally sum them. The most useful part of
22250 // this is that it lines up the results of two PSADBW instructions to be
22251 // two v2i64 vectors which concatenated are the 4 population counts. We can
22252 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
22253 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
22254 SDValue V32 = DAG.getBitcast(VT, V);
22255 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
22256 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
22258 // Do the horizontal sums into two v2i64s.
22259 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22260 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22261 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22262 DAG.getBitcast(ByteVecVT, Low), Zeros);
22263 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22264 DAG.getBitcast(ByteVecVT, High), Zeros);
22266 // Merge them together.
22267 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
22268 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
22269 DAG.getBitcast(ShortVecVT, Low),
22270 DAG.getBitcast(ShortVecVT, High));
22272 return DAG.getBitcast(VT, V);
22275 // The only element type left is i16.
22276 assert(EltVT == MVT::i16 && "Unknown how to handle type");
22278 // To obtain pop count for each i16 element starting from the pop count for
22279 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
22280 // right by 8. It is important to shift as i16s as i8 vector shift isn't
22281 // directly supported.
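  // Worked example (illustrative): if an i16 lane's two byte counts are
  // [hi = a, lo = b], then V << 8 gives [b, 0]; adding as i8s gives
  // [a + b, b]; and the final i16 SRL by 8 leaves a + b, the lane's popcount.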
22282 SDValue ShifterV = DAG.getConstant(8, DL, VT);
22283 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22284 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
22285 DAG.getBitcast(ByteVecVT, V));
22286 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22289 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
22290 const X86Subtarget &Subtarget,
22291 SelectionDAG &DAG) {
22292 MVT VT = Op.getSimpleValueType();
22293 MVT EltVT = VT.getVectorElementType();
22294 unsigned VecSize = VT.getSizeInBits();
22296 // Implement a lookup table in register by using an algorithm based on:
22297 // http://wm.ite.pl/articles/sse-popcount.html
22299 // The general idea is that every lower byte nibble in the input vector is an
22300 // index into a in-register pre-computed pop count table. We then split up the
22301 // input vector in two new ones: (1) a vector with only the shifted-right
22302 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
22303 // masked out higher ones) for each byte. PSHUB is used separately with both
22304 // to index the in-register table. Next, both are added and the result is a
22305 // i8 vector where each element contains the pop count for input byte.
22307 // To obtain the pop count for elements != i8, we follow up with the same
22308 // approach and use additional tricks as described below.
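  // Worked example (illustrative): for the input byte 0xE7, the high nibble
  // 0xE indexes LUT[14] = 3 and the low nibble 0x7 indexes LUT[7] = 3, so the
  // per-byte result is 3 + 3 = 6 = popcount(0xE7).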
22310 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
22311 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
22312 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
22313 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
22315 int NumByteElts = VecSize / 8;
22316 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
22317 SDValue In = DAG.getBitcast(ByteVecVT, Op);
22318 SmallVector<SDValue, 64> LUTVec;
22319 for (int i = 0; i < NumByteElts; ++i)
22320 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22321 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
22322 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
22325 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
22326 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
22329 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
22331 // The input vector is used as the shuffle mask that index elements into the
22332 // LUT. After counting low and high nibbles, add the vector to obtain the
22333 // final pop count per i8 element.
22334 SDValue HighPopCnt =
22335 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
22336 SDValue LowPopCnt =
22337 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
22338 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
22340 if (EltVT == MVT::i8)
22343 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
22346 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
22347 const X86Subtarget &Subtarget,
22348 SelectionDAG &DAG) {
22349 MVT VT = Op.getSimpleValueType();
22350 assert(VT.is128BitVector() &&
22351 "Only 128-bit vector bitmath lowering supported.");
22353 int VecSize = VT.getSizeInBits();
22354 MVT EltVT = VT.getVectorElementType();
22355 int Len = EltVT.getSizeInBits();
22357 // This is the vectorized version of the "best" algorithm from
22358 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
22359 // with a minor tweak to use a series of adds + shifts instead of vector
22360 // multiplications. Implemented for all integer vector types. We only use
22361 // this when we don't have SSSE3 which allows a LUT-based lowering that is
22362 // much faster, even faster than using native popcnt instructions.
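  // Worked example on a single byte (illustrative): v = 0xFF:
  //   v - ((v >> 1) & 0x55)          = 0xAA  (2 bits set per 2-bit field)
  //   (v & 0x33) + ((v >> 2) & 0x33) = 0x44  (4 bits set per 4-bit field)
  //   (v + (v >> 4)) & 0x0F          = 0x08  = popcount(0xFF)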
22364 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
22365 MVT VT = V.getSimpleValueType();
22366 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
22367 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
22369 auto GetMask = [&](SDValue V, APInt Mask) {
22370 MVT VT = V.getSimpleValueType();
22371 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
22372 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
22375 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
22376 // x86, so set the SRL type to have elements at least i16 wide. This is
  // correct because all of our SRLs are followed immediately by a mask anyway
22378 // that handles any bits that sneak into the high bits of the byte elements.
22379 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
22383 // v = v - ((v >> 1) & 0x55555555...)
22385 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
22386 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
22387 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
22389 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
22390 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
22391 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
22392 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
22393 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
22395 // v = (v + (v >> 4)) & 0x0F0F0F0F...
22396 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
22397 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
22398 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
22400 // At this point, V contains the byte-wise population count, and we are
22401 // merely doing a horizontal sum if necessary to get the wider element
22403 if (EltVT == MVT::i8)
22406 return LowerHorizontalByteSum(
22407 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
22411 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
22412 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
22413 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
22414 SelectionDAG &DAG) {
22415 MVT VT = Op.getSimpleValueType();
22416 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
22417 "Unknown CTPOP type to handle");
22418 SDLoc DL(Op.getNode());
22419 SDValue Op0 = Op.getOperand(0);
22421 if (!Subtarget.hasSSSE3()) {
22422 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
22423 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
22424 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
22427 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22428 unsigned NumElems = VT.getVectorNumElements();
22430 // Extract each 128-bit vector, compute pop count and concat the result.
22431 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
22432 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
22434 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
22435 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
22436 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
22439 if (VT.is512BitVector() && !Subtarget.hasBWI()) {
22440 unsigned NumElems = VT.getVectorNumElements();
22442 // Extract each 256-bit vector, compute pop count and concat the result.
22443 SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
22444 SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
22446 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
22447 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
22448 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
22451 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
22454 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
22455 SelectionDAG &DAG) {
22456 assert(Op.getSimpleValueType().isVector() &&
22457 "We only do custom lowering for vector population count.");
22458 return LowerVectorCTPOP(Op, Subtarget, DAG);
22461 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
22462 MVT VT = Op.getSimpleValueType();
22463 SDValue In = Op.getOperand(0);
  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
22467 // perform the BITREVERSE.
22468 if (!VT.isVector()) {
22469 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
22470 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
22471 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
22472 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
22473 DAG.getIntPtrConstant(0, DL));
22476 MVT SVT = VT.getVectorElementType();
22477 int NumElts = VT.getVectorNumElements();
22478 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
22480 // Decompose 256-bit ops into smaller 128-bit ops.
22481 if (VT.is256BitVector()) {
22482 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
22483 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
22485 MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
22486 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
22487 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
22488 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
22491 assert(VT.is128BitVector() &&
22492 "Only 128-bit vector bitreverse lowering supported.");
22494 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
22495 // perform the BSWAP in the shuffle.
  // It's best to shuffle using the second operand as this will implicitly allow
22497 // memory folding for multiple vectors.
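  // Illustrative note on the mask encoding built below: bits [4:0] of each
  // VPPERM mask byte pick the source byte (values 16..31 select the second
  // operand) and the op field (2 << 5) requests bit-reversal, so e.g.
  // 16 | (2 << 5) = 0x50 selects byte 0 of the second source, reversed.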
22498 SmallVector<SDValue, 16> MaskElts;
22499 for (int i = 0; i != NumElts; ++i) {
22500 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
22501 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
22502 int PermuteByte = SourceByte | (2 << 5);
22503 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
22507 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
22508 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
22509 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
22511 return DAG.getBitcast(VT, Res);
22514 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
22515 SelectionDAG &DAG) {
22516 if (Subtarget.hasXOP())
22517 return LowerBITREVERSE_XOP(Op, DAG);
22519 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
22521 MVT VT = Op.getSimpleValueType();
22522 SDValue In = Op.getOperand(0);
22525 unsigned NumElts = VT.getVectorNumElements();
22526 assert(VT.getScalarType() == MVT::i8 &&
22527 "Only byte vector BITREVERSE supported");
22529 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
22530 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22531 MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
22532 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
22533 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
22534 Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
22535 Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
22536 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22539 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
22540 // two nibbles and a PSHUFB lookup to find the bitreverse of each
22541 // 0-15 value (moved to the other nibble).
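  // Worked example (illustrative): for the byte 0xD2 = 0b11010010, the low
  // nibble 0x2 maps through LoLUT to 0x40 and the high nibble 0xD maps
  // through HiLUT to 0x0B; OR'ing gives 0x4B = 0b01001011, the bit-reverse.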
22542 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
22543 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
22544 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
22546 const int LoLUT[16] = {
22547 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
22548 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
22549 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
22550 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
22551 const int HiLUT[16] = {
22552 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
22553 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
22554 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
22555 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
22557 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
22558 for (unsigned i = 0; i < NumElts; ++i) {
22559 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
22560 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
22563 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
22564 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
22565 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
22566 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
22567 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
22570 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
22571 unsigned NewOpc = 0;
22572 switch (N->getOpcode()) {
22573 case ISD::ATOMIC_LOAD_ADD:
22574 NewOpc = X86ISD::LADD;
22576 case ISD::ATOMIC_LOAD_SUB:
22577 NewOpc = X86ISD::LSUB;
22579 case ISD::ATOMIC_LOAD_OR:
22580 NewOpc = X86ISD::LOR;
22582 case ISD::ATOMIC_LOAD_XOR:
22583 NewOpc = X86ISD::LXOR;
22585 case ISD::ATOMIC_LOAD_AND:
22586 NewOpc = X86ISD::LAND;
22589 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
22592 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
22593 return DAG.getMemIntrinsicNode(
22594 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
22595 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
22596 /*MemVT=*/N->getSimpleValueType(0), MMO);
22599 /// Lower atomic_load_ops into LOCK-prefixed operations.
22600 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
22601 const X86Subtarget &Subtarget) {
22602 SDValue Chain = N->getOperand(0);
22603 SDValue LHS = N->getOperand(1);
22604 SDValue RHS = N->getOperand(2);
22605 unsigned Opc = N->getOpcode();
22606 MVT VT = N->getSimpleValueType(0);
22609 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
22610 // can only be lowered when the result is unused. They should have already
22611 // been transformed into a cmpxchg loop in AtomicExpand.
22612 if (N->hasAnyUseOfValue(0)) {
22613 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
22614 // select LXADD if LOCK_SUB can't be selected.
22615 if (Opc == ISD::ATOMIC_LOAD_SUB) {
22616 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
22617 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
22618 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
22619 RHS, AN->getMemOperand());
22621 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
22622 "Used AtomicRMW ops other than Add should have been expanded!");
22626 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
22627 // RAUW the chain, but don't worry about the result, as it's unused.
22628 assert(!N->hasAnyUseOfValue(0));
22629 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
22633 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
22634 SDNode *Node = Op.getNode();
22636 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
22638 // Convert seq_cst store -> xchg
22639 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
22640 // FIXME: On 32-bit, store -> fist or movq would be more efficient
22641 // (The only way to get a 16-byte store is cmpxchg16b)
22642 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
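  // For example (illustrative): a seq_cst store of an i32 is rewritten below
  // as an ISD::ATOMIC_SWAP whose value result is ignored; XCHG with a memory
  // operand is implicitly locked and acts as a full barrier, which is what
  // seq_cst requires here.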
22643 if (cast<AtomicSDNode>(Node)->getOrdering() ==
22644 AtomicOrdering::SequentiallyConsistent ||
22645 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
22646 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
22647 cast<AtomicSDNode>(Node)->getMemoryVT(),
22648 Node->getOperand(0),
22649 Node->getOperand(1), Node->getOperand(2),
22650 cast<AtomicSDNode>(Node)->getMemOperand());
22651 return Swap.getValue(1);
22653 // Other atomic stores have a simple pattern.
22657 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
22658 MVT VT = Op.getNode()->getSimpleValueType(0);
22660 // Let legalize expand this if it isn't a legal type yet.
22661 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
22664 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
22667 bool ExtraOp = false;
22668 switch (Op.getOpcode()) {
22669 default: llvm_unreachable("Invalid code");
22670 case ISD::ADDC: Opc = X86ISD::ADD; break;
22671 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
22672 case ISD::SUBC: Opc = X86ISD::SUB; break;
22673 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
22677 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
22679 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
22680 Op.getOperand(1), Op.getOperand(2));
22683 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
22684 SelectionDAG &DAG) {
22685 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
22687 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
22688 // which returns the values as { float, float } (in XMM0) or
22689 // { double, double } (which is returned in XMM0, XMM1).
22691 SDValue Arg = Op.getOperand(0);
22692 EVT ArgVT = Arg.getValueType();
22693 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22695 TargetLowering::ArgListTy Args;
22696 TargetLowering::ArgListEntry Entry;
22700 Entry.isSExt = false;
22701 Entry.isZExt = false;
22702 Args.push_back(Entry);
22704 bool isF64 = ArgVT == MVT::f64;
22705 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
22706 // the small struct {f32, f32} is returned in (eax, edx). For f64,
22707 // the results are returned via SRet in memory.
22708 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
22709 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22711 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
22713 Type *RetTy = isF64
22714 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
22715 : (Type*)VectorType::get(ArgTy, 4);
22717 TargetLowering::CallLoweringInfo CLI(DAG);
22718 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
22719 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
22721 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
22724 // Returned in xmm0 and xmm1.
22725 return CallResult.first;
22727 // Returned in bits 0:31 and 32:64 xmm0.
22728 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
22729 CallResult.first, DAG.getIntPtrConstant(0, dl));
22730 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
22731 CallResult.first, DAG.getIntPtrConstant(1, dl));
22732 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
22733 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
22736 /// Widen a vector input to a vector of NVT. The
22737 /// input vector must have the same element type as NVT.
22738 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
22739 bool FillWithZeroes = false) {
22740 // Check if InOp already has the right width.
22741 MVT InVT = InOp.getSimpleValueType();
22745 if (InOp.isUndef())
22746 return DAG.getUNDEF(NVT);
22748 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
22749 "input and widen element type must match");
22751 unsigned InNumElts = InVT.getVectorNumElements();
22752 unsigned WidenNumElts = NVT.getVectorNumElements();
22753 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
22754 "Unexpected request for vector widening");
22756 EVT EltVT = NVT.getVectorElementType();
22759 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
22760 InOp.getNumOperands() == 2) {
22761 SDValue N1 = InOp.getOperand(1);
22762 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
22764 InOp = InOp.getOperand(0);
22765 InVT = InOp.getSimpleValueType();
22766 InNumElts = InVT.getVectorNumElements();
22769 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
22770 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
22771 SmallVector<SDValue, 16> Ops;
22772 for (unsigned i = 0; i < InNumElts; ++i)
22773 Ops.push_back(InOp.getOperand(i));
22775 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
22776 DAG.getUNDEF(EltVT);
22777 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
22778 Ops.push_back(FillVal);
22779 return DAG.getBuildVector(NVT, dl, Ops);
22781 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
22783 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
22784 InOp, DAG.getIntPtrConstant(0, dl));
22787 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
22788 SelectionDAG &DAG) {
22789 assert(Subtarget.hasAVX512() &&
22790 "MGATHER/MSCATTER are supported on AVX-512 arch only");
22792 // X86 scatter kills mask register, so its type should be added to
22793 // the list of return values.
22794 // If the "scatter" has 2 return values, it is already handled.
22795 if (Op.getNode()->getNumValues() == 2)
22798 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
22799 SDValue Src = N->getValue();
22800 MVT VT = Src.getSimpleValueType();
22801 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
22804 SDValue NewScatter;
22805 SDValue Index = N->getIndex();
22806 SDValue Mask = N->getMask();
22807 SDValue Chain = N->getChain();
22808 SDValue BasePtr = N->getBasePtr();
22809 MVT MemVT = N->getMemoryVT().getSimpleVT();
22810 MVT IndexVT = Index.getSimpleValueType();
22811 MVT MaskVT = Mask.getSimpleValueType();
22813 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
22814 // The v2i32 value was promoted to v2i64.
22815 // Now we "redo" the type legalizer's work and widen the original
22816 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
22818 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
22819 "Unexpected memory type");
22820 int ShuffleMask[] = {0, 2, -1, -1};
22821 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
22822 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
22823 // Now we have 4 elements instead of 2.
22824 // Expand the index.
22825 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
22826 Index = ExtendToType(Index, NewIndexVT, DAG);
22828 // Expand the mask with zeroes
22829 // Mask may be <2 x i64> or <2 x i1> at this moment
22830 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
22831 "Unexpected mask type");
22832 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
22833 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
22837 unsigned NumElts = VT.getVectorNumElements();
22838 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
22839 !Index.getSimpleValueType().is512BitVector()) {
22840 // AVX512F supports only 512-bit vectors; either the data or the index
22841 // must be 512 bits wide. If both the index and the data are narrower,
22842 // but the vector contains 8 elements, just sign-extend the index.
22843 if (IndexVT == MVT::v8i32)
22844 // Just extend index
22845 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
22847 // The minimal number of elts in scatter is 8
22850 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
22851 // Use original index here, do not modify the index twice
22852 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
22853 if (IndexVT.getScalarType() == MVT::i32)
22854 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
22857 // At this point the mask operand has been promoted.
22858 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
22859 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
22860 // Use the original mask here, do not modify the mask twice
22861 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
22863 // The value that should be stored
22864 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
22865 Src = ExtendToType(Src, NewVT, DAG);
22868 // If the mask is "wide" at this point, truncate it to an i1 vector.
22869 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
22870 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
22872 // The mask is killed by scatter, add it to the values
22873 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
22874 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
22875 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
22876 N->getMemOperand());
22877 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
22878 return SDValue(NewScatter.getNode(), 1);
22881 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
22882 SelectionDAG &DAG) {
22884 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
22885 MVT VT = Op.getSimpleValueType();
22886 MVT ScalarVT = VT.getScalarType();
22887 SDValue Mask = N->getMask();
22890 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
22891 "Expanding masked load is supported on AVX-512 target only!");
22893 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
22894 "Expanding masked load is supported for 32 and 64-bit types only!");
22896 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
22897 // VLX; expanding loads of these types are widened below.
22898 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
22899 return Op;
22901 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
22902 "Cannot lower masked load op.");
22904 assert((ScalarVT.getSizeInBits() >= 32 ||
22905 (Subtarget.hasBWI() &&
22906 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
22907 "Unsupported masked load op.");
22909 // This operation is legal for targets with VLX, but without
22910 // VLX the vector should be widened to 512 bits.
22911 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
22912 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
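// For example, a masked load of v8f32 on AVX512F without VLX is widened
// here to v16f32 (NumEltsInWideVec = 512 / 32 = 16); the mask is widened
// with zeroes so the extra lanes are never loaded.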
22913 SDValue Src0 = N->getSrc0();
22914 Src0 = ExtendToType(Src0, WideDataVT, DAG);
22916 // Mask element has to be i1.
22917 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
22918 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
22919 "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
22921 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
22923 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
22924 if (MaskEltTy != MVT::i1)
22925 Mask = DAG.getNode(ISD::TRUNCATE, dl,
22926 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
22927 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
22928 N->getBasePtr(), Mask, Src0,
22929 N->getMemoryVT(), N->getMemOperand(),
22930 N->getExtensionType(),
22931 N->isExpandingLoad());
22933 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
22934 NewLoad.getValue(0),
22935 DAG.getIntPtrConstant(0, dl));
22936 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
22937 return DAG.getMergeValues(RetOps, dl);
22940 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
22941 SelectionDAG &DAG) {
22942 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
22943 SDValue DataToStore = N->getValue();
22944 MVT VT = DataToStore.getSimpleValueType();
22945 MVT ScalarVT = VT.getScalarType();
22946 SDValue Mask = N->getMask();
22949 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
22950 "Compressing masked store is supported on AVX-512 target only!");
22952 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
22953 "Compressing masked store is supported for 32 and 64-bit types only!");
22955 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
22956 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
22957 return Op;
22959 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
22960 "Cannot lower masked store op.");
22962 assert((ScalarVT.getSizeInBits() >= 32 ||
22963 (Subtarget.hasBWI() &&
22964 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
22965 "Unsupported masked store op.");
22967 // This operation is legal for targets with VLX, but without
22968 // VLX the vector should be widened to 512 bits.
22969 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
22970 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
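// For example, a masked store of v8i32 becomes a v16i32 store whose extra
// mask lanes are zero, so the widened lanes are never written to memory.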
22972 // Mask element has to be i1.
22973 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
22974 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
22975 "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
22977 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
22979 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
22980 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
22981 if (MaskEltTy != MVT::i1)
22982 Mask = DAG.getNode(ISD::TRUNCATE, dl,
22983 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
22984 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
22985 Mask, N->getMemoryVT(), N->getMemOperand(),
22986 N->isTruncatingStore(), N->isCompressingStore());
22989 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
22990 SelectionDAG &DAG) {
22991 assert(Subtarget.hasAVX512() &&
22992 "MGATHER/MSCATTER are supported on AVX-512 arch only");
22994 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
22996 MVT VT = Op.getSimpleValueType();
22997 SDValue Index = N->getIndex();
22998 SDValue Mask = N->getMask();
22999 SDValue Src0 = N->getValue();
23000 MVT IndexVT = Index.getSimpleValueType();
23001 MVT MaskVT = Mask.getSimpleValueType();
23003 unsigned NumElts = VT.getVectorNumElements();
23004 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23006 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23007 !Index.getSimpleValueType().is512BitVector()) {
23008 // AVX512F supports only 512-bit vectors; either the data or the index
23009 // must be 512 bits wide. If both the index and the data are narrower,
23010 // but the vector contains 8 elements, just sign-extend the index.
23011 if (NumElts == 8) {
23012 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23013 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
23014 N->getOperand(3), Index };
23015 DAG.UpdateNodeOperands(N, Ops);
23019 // Minimal number of elements in Gather
23022 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23023 Index = ExtendToType(Index, NewIndexVT, DAG);
23024 if (IndexVT.getScalarType() == MVT::i32)
23025 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23028 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
23029 // At this point the mask operand has been promoted.
23030 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23031 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23032 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23033 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23035 // The pass-thru value
23036 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23037 Src0 = ExtendToType(Src0, NewVT, DAG);
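// For example, a gather of v4f32 with a v4i32 index on AVX512F: the index,
// mask and pass-thru value are all widened to 8 elements, the gather
// produces v8f32, and the low v4f32 subvector is extracted below.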
23039 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23040 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23041 N->getMemoryVT(), dl, Ops,
23042 N->getMemOperand());
23043 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23044 NewGather.getValue(0),
23045 DAG.getIntPtrConstant(0, dl));
23046 SDValue RetOps[] = {Extract, NewGather.getValue(1)};
23047 return DAG.getMergeValues(RetOps, dl);
23052 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23053 SelectionDAG &DAG) const {
23054 // TODO: Eventually, the lowering of these nodes should be informed by or
23055 // deferred to the GC strategy for the function in which they appear. For
23056 // now, however, they must be lowered to something. Since they are logically
23057 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23058 // require special handling for these nodes), lower them as literal NOOPs for
23060 SmallVector<SDValue, 2> Ops;
23062 Ops.push_back(Op.getOperand(0));
23063 if (Op->getGluedNode())
23064 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23067 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23068 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23073 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
23074 SelectionDAG &DAG) const {
23075 // TODO: Eventually, the lowering of these nodes should be informed by or
23076 // deferred to the GC strategy for the function in which they appear. For
23077 // now, however, they must be lowered to something. Since they are logically
23078 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23079 // require special handling for these nodes), lower them as literal NOOPs for
23081 SmallVector<SDValue, 2> Ops;
23083 Ops.push_back(Op.getOperand(0));
23084 if (Op->getGluedNode())
23085 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23088 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23089 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23094 /// Provide custom lowering hooks for some operations.
23095 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
23096 switch (Op.getOpcode()) {
23097 default: llvm_unreachable("Should not custom lower this!");
23098 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
23099 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
23100 return LowerCMP_SWAP(Op, Subtarget, DAG);
23101 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
23102 case ISD::ATOMIC_LOAD_ADD:
23103 case ISD::ATOMIC_LOAD_SUB:
23104 case ISD::ATOMIC_LOAD_OR:
23105 case ISD::ATOMIC_LOAD_XOR:
23106 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
23107 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
23108 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
23109 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
23110 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
23111 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
23112 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
23113 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
23114 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
23115 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
23116 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
23117 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
23118 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
23119 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
23120 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
23121 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
23122 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
23123 case ISD::SHL_PARTS:
23124 case ISD::SRA_PARTS:
23125 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
23126 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
23127 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
23128 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
23129 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
23130 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
23131 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
23132 case ISD::ZERO_EXTEND_VECTOR_INREG:
23133 case ISD::SIGN_EXTEND_VECTOR_INREG:
23134 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
23135 case ISD::FP_TO_SINT:
23136 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG);
23137 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
23138 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
23140 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
23141 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
23142 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
23143 case ISD::SETCC: return LowerSETCC(Op, DAG);
23144 case ISD::SETCCE: return LowerSETCCE(Op, DAG);
23145 case ISD::SELECT: return LowerSELECT(Op, DAG);
23146 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
23147 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
23148 case ISD::VASTART: return LowerVASTART(Op, DAG);
23149 case ISD::VAARG: return LowerVAARG(Op, DAG);
23150 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
23151 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
23152 case ISD::INTRINSIC_VOID:
23153 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
23154 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
23155 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
23156 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
23157 case ISD::FRAME_TO_ARGS_OFFSET:
23158 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
23159 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
23160 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
23161 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
23162 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
23163 case ISD::EH_SJLJ_SETUP_DISPATCH:
23164 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
23165 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
23166 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
23167 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
23169 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
23171 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
23172 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
23174 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
23175 case ISD::UMUL_LOHI:
23176 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
23177 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
23180 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
23186 case ISD::UMULO: return LowerXALUO(Op, DAG);
23187 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
23188 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
23192 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
23193 case ISD::ADD: return LowerADD(Op, DAG);
23194 case ISD::SUB: return LowerSUB(Op, DAG);
23198 case ISD::UMIN: return LowerMINMAX(Op, DAG);
23199 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
23200 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
23201 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
23202 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
23203 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
23204 case ISD::GC_TRANSITION_START:
23205 return LowerGC_TRANSITION_START(Op, DAG);
23206 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
23207 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
23211 /// Places new result values for the node in Results (their number
23212 /// and types must exactly match those of the original return values of
23213 /// the node), or leaves Results empty, which indicates that the node is not
23214 /// to be custom lowered after all.
23215 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
23216 SmallVectorImpl<SDValue> &Results,
23217 SelectionDAG &DAG) const {
23218 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
23220 if (!Res.getNode())
23223 assert((N->getNumValues() <= Res->getNumValues()) &&
23224 "Lowering returned the wrong number of results!");
23226 // Place new result values based on N's result number.
23227 // In some cases (LowerSINT_TO_FP, for example) Res has more result values
23228 // than the original node; the chain (the last value) should be dropped.
23229 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
23230 Results.push_back(Res.getValue(I));
23233 /// Replace a node with an illegal result type with a new node built out of
23234 /// custom code.
23235 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
23236 SmallVectorImpl<SDValue>&Results,
23237 SelectionDAG &DAG) const {
23239 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23240 switch (N->getOpcode()) {
23242 llvm_unreachable("Do not know how to custom type legalize this operation!");
23243 case X86ISD::AVG: {
23244 // Legalize types for X86ISD::AVG by expanding vectors.
23245 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23247 auto InVT = N->getValueType(0);
23248 auto InVTSize = InVT.getSizeInBits();
23249 const unsigned RegSize =
23250 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
23251 assert((Subtarget.hasBWI() || RegSize < 512) &&
23252 "512-bit vector requires AVX512BW");
23253 assert((Subtarget.hasAVX2() || RegSize < 256) &&
23254 "256-bit vector requires AVX2");
23256 auto ElemVT = InVT.getVectorElementType();
23257 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
23258 RegSize / ElemVT.getSizeInBits());
23259 assert(RegSize % InVT.getSizeInBits() == 0);
23260 unsigned NumConcat = RegSize / InVT.getSizeInBits();
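// For example, a v8i8 input (64 bits) gives RegSize = 128 and NumConcat = 2:
// each operand is concatenated with one undef v8i8 to form a v16i8, the AVG
// is performed on v16i8, and the low v8i8 subvector is extracted below.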
23262 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
23263 Ops[0] = N->getOperand(0);
23264 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23265 Ops[0] = N->getOperand(1);
23266 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23268 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
23269 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
23270 DAG.getIntPtrConstant(0, dl)));
23273 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
23274 case X86ISD::FMINC:
23275 case X86ISD::FMIN:
23276 case X86ISD::FMAXC:
23277 case X86ISD::FMAX: {
23278 EVT VT = N->getValueType(0);
23279 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
23280 SDValue UNDEF = DAG.getUNDEF(VT);
23281 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23282 N->getOperand(0), UNDEF);
23283 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23284 N->getOperand(1), UNDEF);
23285 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
23293 case ISD::UDIVREM: {
23294 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
23295 Results.push_back(V);
23298 case ISD::FP_TO_SINT:
23299 case ISD::FP_TO_UINT: {
23300 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
23302 if (N->getValueType(0) == MVT::v2i32) {
23303 assert((IsSigned || Subtarget.hasAVX512()) &&
23304 "Can only handle signed conversion without AVX512");
23305 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23306 SDValue Src = N->getOperand(0);
23307 if (Src.getValueType() == MVT::v2f64) {
23308 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23309 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
23310 : X86ISD::CVTTP2UI,
23311 dl, MVT::v4i32, Src);
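// The hardware conversion produces a full v4i32; only the low two lanes
// are meaningful, so the v2i32 subvector is extracted below.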
23312 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23313 Results.push_back(Res);
23316 if (Src.getValueType() == MVT::v2f32) {
23317 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23318 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23319 DAG.getUNDEF(MVT::v2f32));
23320 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
23321 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
23322 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23323 Results.push_back(Res);
23327 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
23328 // so early out here.
23332 std::pair<SDValue,SDValue> Vals =
23333 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
23334 SDValue FIST = Vals.first, StackSlot = Vals.second;
23335 if (FIST.getNode()) {
23336 EVT VT = N->getValueType(0);
23337 // Return a load from the stack slot.
23338 if (StackSlot.getNode())
23339 Results.push_back(
23340 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
23341 else
23342 Results.push_back(FIST);
23346 case ISD::SINT_TO_FP: {
23347 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
23348 SDValue Src = N->getOperand(0);
23349 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
23351 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
23354 case ISD::UINT_TO_FP: {
23355 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23356 EVT VT = N->getValueType(0);
23357 if (VT != MVT::v2f32)
23359 SDValue Src = N->getOperand(0);
23360 EVT SrcVT = Src.getValueType();
23361 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
23362 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
23365 if (SrcVT != MVT::v2i32)
23367 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
23368 SDValue VBias =
23369 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
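// 0x4330000000000000 is 2^52 as a double. OR-ing a zero-extended 32-bit
// value into the low mantissa bits of 2^52 yields exactly 2^52 + x, so
// subtracting the bias below recovers x converted to double.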
23370 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
23371 DAG.getBitcast(MVT::v2i64, VBias));
23372 Or = DAG.getBitcast(MVT::v2f64, Or);
23373 // TODO: Are there any fast-math-flags to propagate here?
23374 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
23375 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
23378 case ISD::FP_ROUND: {
23379 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
23381 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
23382 Results.push_back(V);
23385 case ISD::FP_EXTEND: {
23386 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
23387 // No other ValueType for FP_EXTEND should reach this point.
23388 assert(N->getValueType(0) == MVT::v2f32 &&
23389 "Do not know how to legalize this Node");
23392 case ISD::INTRINSIC_W_CHAIN: {
23393 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
23395 default : llvm_unreachable("Do not know how to custom type "
23396 "legalize this intrinsic operation!");
23397 case Intrinsic::x86_rdtsc:
23398 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
23400 case Intrinsic::x86_rdtscp:
23401 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
23403 case Intrinsic::x86_rdpmc:
23404 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
23406 case Intrinsic::x86_xgetbv:
23407 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
23410 case ISD::INTRINSIC_WO_CHAIN: {
23411 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
23412 Results.push_back(V);
23415 case ISD::READCYCLECOUNTER: {
23416 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
23419 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
23420 EVT T = N->getValueType(0);
23421 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
23422 bool Regs64bit = T == MVT::i128;
23423 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
23424 SDValue cpInL, cpInH;
23425 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
23426 DAG.getConstant(0, dl, HalfT));
23427 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
23428 DAG.getConstant(1, dl, HalfT));
23429 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
23430 Regs64bit ? X86::RAX : X86::EAX,
23432 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
23433 Regs64bit ? X86::RDX : X86::EDX,
23434 cpInH, cpInL.getValue(1));
23435 SDValue swapInL, swapInH;
23436 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
23437 DAG.getConstant(0, dl, HalfT));
23438 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
23439 DAG.getConstant(1, dl, HalfT));
23440 swapInH =
23441 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
23442 swapInH, cpInH.getValue(1));
23443 // If the current function needs the base pointer, RBX,
23444 // we shouldn't use cmpxchg directly: the lowering of that
23445 // instruction clobbers RBX, and since RBX is then a reserved
23446 // register, the register allocator will not ensure its value
23447 // is properly saved and restored around this live range.
23449 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
23450 SDValue Result;
23451 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23452 unsigned BasePtr = TRI->getBaseRegister();
23453 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
23454 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
23455 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
23456 // ISel prefers the LCMPXCHG64 variant.
23457 // If the assert below breaks, that is no longer the case,
23458 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
23459 // not just EBX. This is a matter of accepting i64 input for that
23460 // pseudo, and restoring into a register of the right width
23461 // in the expand pseudo. Everything else should just work.
23462 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
23463 "Saving only half of the RBX");
23464 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
23465 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
23466 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
23467 Regs64bit ? X86::RBX : X86::EBX,
23468 HalfT, swapInH.getValue(1));
23469 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
23471 /*Glue*/ RBXSave.getValue(2)};
23472 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
23473 } else {
23474 unsigned Opcode =
23475 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
23476 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
23477 Regs64bit ? X86::RBX : X86::EBX, swapInL,
23478 swapInH.getValue(1));
23479 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
23480 swapInL.getValue(1)};
23481 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
23483 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
23484 Regs64bit ? X86::RAX : X86::EAX,
23485 HalfT, Result.getValue(1));
23486 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
23487 Regs64bit ? X86::RDX : X86::EDX,
23488 HalfT, cpOutL.getValue(2));
23489 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
23491 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
23492 MVT::i32, cpOutH.getValue(2));
23493 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
23494 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
23496 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
23497 Results.push_back(Success);
23498 Results.push_back(EFLAGS.getValue(1));
23501 case ISD::ATOMIC_SWAP:
23502 case ISD::ATOMIC_LOAD_ADD:
23503 case ISD::ATOMIC_LOAD_SUB:
23504 case ISD::ATOMIC_LOAD_AND:
23505 case ISD::ATOMIC_LOAD_OR:
23506 case ISD::ATOMIC_LOAD_XOR:
23507 case ISD::ATOMIC_LOAD_NAND:
23508 case ISD::ATOMIC_LOAD_MIN:
23509 case ISD::ATOMIC_LOAD_MAX:
23510 case ISD::ATOMIC_LOAD_UMIN:
23511 case ISD::ATOMIC_LOAD_UMAX:
23512 case ISD::ATOMIC_LOAD: {
23513 // Delegate to generic TypeLegalization. Situations we can really handle
23514 // should have already been dealt with by AtomicExpandPass.cpp.
23517 case ISD::BITCAST: {
23518 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23519 EVT DstVT = N->getValueType(0);
23520 EVT SrcVT = N->getOperand(0)->getValueType(0);
23522 if (SrcVT != MVT::f64 ||
23523 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
23526 unsigned NumElts = DstVT.getVectorNumElements();
23527 EVT SVT = DstVT.getVectorElementType();
23528 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23529 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
23530 MVT::v2f64, N->getOperand(0));
23531 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
23533 if (ExperimentalVectorWideningLegalization) {
23534 // If we are legalizing vectors by widening, we already have the desired
23535 // legal vector type, just return it.
23536 Results.push_back(ToVecInt);
23540 SmallVector<SDValue, 8> Elts;
23541 for (unsigned i = 0, e = NumElts; i != e; ++i)
23542 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
23543 ToVecInt, DAG.getIntPtrConstant(i, dl)));
23545 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
23550 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
23551 switch ((X86ISD::NodeType)Opcode) {
23552 case X86ISD::FIRST_NUMBER: break;
23553 case X86ISD::BSF: return "X86ISD::BSF";
23554 case X86ISD::BSR: return "X86ISD::BSR";
23555 case X86ISD::SHLD: return "X86ISD::SHLD";
23556 case X86ISD::SHRD: return "X86ISD::SHRD";
23557 case X86ISD::FAND: return "X86ISD::FAND";
23558 case X86ISD::FANDN: return "X86ISD::FANDN";
23559 case X86ISD::FOR: return "X86ISD::FOR";
23560 case X86ISD::FXOR: return "X86ISD::FXOR";
23561 case X86ISD::FILD: return "X86ISD::FILD";
23562 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
23563 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
23564 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
23565 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
23566 case X86ISD::FLD: return "X86ISD::FLD";
23567 case X86ISD::FST: return "X86ISD::FST";
23568 case X86ISD::CALL: return "X86ISD::CALL";
23569 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
23570 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
23571 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
23572 case X86ISD::BT: return "X86ISD::BT";
23573 case X86ISD::CMP: return "X86ISD::CMP";
23574 case X86ISD::COMI: return "X86ISD::COMI";
23575 case X86ISD::UCOMI: return "X86ISD::UCOMI";
23576 case X86ISD::CMPM: return "X86ISD::CMPM";
23577 case X86ISD::CMPMU: return "X86ISD::CMPMU";
23578 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
23579 case X86ISD::SETCC: return "X86ISD::SETCC";
23580 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
23581 case X86ISD::FSETCC: return "X86ISD::FSETCC";
23582 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
23583 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
23584 case X86ISD::CMOV: return "X86ISD::CMOV";
23585 case X86ISD::BRCOND: return "X86ISD::BRCOND";
23586 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
23587 case X86ISD::IRET: return "X86ISD::IRET";
23588 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
23589 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
23590 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
23591 case X86ISD::Wrapper: return "X86ISD::Wrapper";
23592 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
23593 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
23594 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
23595 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
23596 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
23597 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
23598 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
23599 case X86ISD::PINSRB: return "X86ISD::PINSRB";
23600 case X86ISD::PINSRW: return "X86ISD::PINSRW";
23601 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
23602 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
23603 case X86ISD::ANDNP: return "X86ISD::ANDNP";
23604 case X86ISD::BLENDI: return "X86ISD::BLENDI";
23605 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
23606 case X86ISD::ADDUS: return "X86ISD::ADDUS";
23607 case X86ISD::SUBUS: return "X86ISD::SUBUS";
23608 case X86ISD::HADD: return "X86ISD::HADD";
23609 case X86ISD::HSUB: return "X86ISD::HSUB";
23610 case X86ISD::FHADD: return "X86ISD::FHADD";
23611 case X86ISD::FHSUB: return "X86ISD::FHSUB";
23612 case X86ISD::ABS: return "X86ISD::ABS";
23613 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
23614 case X86ISD::FMAX: return "X86ISD::FMAX";
23615 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
23616 case X86ISD::FMIN: return "X86ISD::FMIN";
23617 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
23618 case X86ISD::FMAXC: return "X86ISD::FMAXC";
23619 case X86ISD::FMINC: return "X86ISD::FMINC";
23620 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
23621 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
23622 case X86ISD::FRCP: return "X86ISD::FRCP";
23623 case X86ISD::FRCPS: return "X86ISD::FRCPS";
23624 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
23625 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
23626 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
23627 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
23628 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
23629 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
23630 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
23631 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
23632 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
23633 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
23634 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
23635 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
23636 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
23637 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
23638 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
23639 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
23640 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
23641 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
23642 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
23643 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
23644 case X86ISD::LADD: return "X86ISD::LADD";
23645 case X86ISD::LSUB: return "X86ISD::LSUB";
23646 case X86ISD::LOR: return "X86ISD::LOR";
23647 case X86ISD::LXOR: return "X86ISD::LXOR";
23648 case X86ISD::LAND: return "X86ISD::LAND";
23649 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
23650 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
23651 case X86ISD::VZEXT: return "X86ISD::VZEXT";
23652 case X86ISD::VSEXT: return "X86ISD::VSEXT";
23653 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
23654 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
23655 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
23656 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
23657 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
23658 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
23659 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
23660 case X86ISD::VINSERT: return "X86ISD::VINSERT";
23661 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
23662 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
23663 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
23664 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
23665 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
23666 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
23667 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
23668 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
23669 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
23670 case X86ISD::VSHL: return "X86ISD::VSHL";
23671 case X86ISD::VSRL: return "X86ISD::VSRL";
23672 case X86ISD::VSRA: return "X86ISD::VSRA";
23673 case X86ISD::VSHLI: return "X86ISD::VSHLI";
23674 case X86ISD::VSRLI: return "X86ISD::VSRLI";
23675 case X86ISD::VSRAI: return "X86ISD::VSRAI";
23676 case X86ISD::VSRAV: return "X86ISD::VSRAV";
23677 case X86ISD::VROTLI: return "X86ISD::VROTLI";
23678 case X86ISD::VROTRI: return "X86ISD::VROTRI";
23679 case X86ISD::VPPERM: return "X86ISD::VPPERM";
23680 case X86ISD::CMPP: return "X86ISD::CMPP";
23681 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
23682 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
23683 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
23684 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
23685 case X86ISD::ADD: return "X86ISD::ADD";
23686 case X86ISD::SUB: return "X86ISD::SUB";
23687 case X86ISD::ADC: return "X86ISD::ADC";
23688 case X86ISD::SBB: return "X86ISD::SBB";
23689 case X86ISD::SMUL: return "X86ISD::SMUL";
23690 case X86ISD::UMUL: return "X86ISD::UMUL";
23691 case X86ISD::SMUL8: return "X86ISD::SMUL8";
23692 case X86ISD::UMUL8: return "X86ISD::UMUL8";
23693 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
23694 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
23695 case X86ISD::INC: return "X86ISD::INC";
23696 case X86ISD::DEC: return "X86ISD::DEC";
23697 case X86ISD::OR: return "X86ISD::OR";
23698 case X86ISD::XOR: return "X86ISD::XOR";
23699 case X86ISD::AND: return "X86ISD::AND";
23700 case X86ISD::BEXTR: return "X86ISD::BEXTR";
23701 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
23702 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
23703 case X86ISD::PTEST: return "X86ISD::PTEST";
23704 case X86ISD::TESTP: return "X86ISD::TESTP";
23705 case X86ISD::TESTM: return "X86ISD::TESTM";
23706 case X86ISD::TESTNM: return "X86ISD::TESTNM";
23707 case X86ISD::KORTEST: return "X86ISD::KORTEST";
23708 case X86ISD::KTEST: return "X86ISD::KTEST";
23709 case X86ISD::PACKSS: return "X86ISD::PACKSS";
23710 case X86ISD::PACKUS: return "X86ISD::PACKUS";
23711 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
23712 case X86ISD::VALIGN: return "X86ISD::VALIGN";
23713 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
23714 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
23715 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
23716 case X86ISD::SHUFP: return "X86ISD::SHUFP";
23717 case X86ISD::SHUF128: return "X86ISD::SHUF128";
23718 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
23719 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
23720 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
23721 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
23722 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
23723 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
23724 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
23725 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
23726 case X86ISD::MOVSD: return "X86ISD::MOVSD";
23727 case X86ISD::MOVSS: return "X86ISD::MOVSS";
23728 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
23729 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
23730 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
23731 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
23732 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
23733 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
23734 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
23735 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
23736 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
23737 case X86ISD::VPERMV: return "X86ISD::VPERMV";
23738 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
23739 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
23740 case X86ISD::VPERMI: return "X86ISD::VPERMI";
23741 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
23742 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
23743 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
23744 case X86ISD::VRANGE: return "X86ISD::VRANGE";
23745 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
23746 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
23747 case X86ISD::PSADBW: return "X86ISD::PSADBW";
23748 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
23749 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
23750 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
23751 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
23752 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
23753 case X86ISD::MFENCE: return "X86ISD::MFENCE";
23754 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
23755 case X86ISD::SAHF: return "X86ISD::SAHF";
23756 case X86ISD::RDRAND: return "X86ISD::RDRAND";
23757 case X86ISD::RDSEED: return "X86ISD::RDSEED";
23758 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
23759 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
23760 case X86ISD::VPROT: return "X86ISD::VPROT";
23761 case X86ISD::VPROTI: return "X86ISD::VPROTI";
23762 case X86ISD::VPSHA: return "X86ISD::VPSHA";
23763 case X86ISD::VPSHL: return "X86ISD::VPSHL";
23764 case X86ISD::VPCOM: return "X86ISD::VPCOM";
23765 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
23766 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
23767 case X86ISD::FMADD: return "X86ISD::FMADD";
23768 case X86ISD::FMSUB: return "X86ISD::FMSUB";
23769 case X86ISD::FNMADD: return "X86ISD::FNMADD";
23770 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
23771 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
23772 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
23773 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
23774 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
23775 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
23776 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
23777 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
23778 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
23779 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
23780 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
23781 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
23782 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
23783 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
23784 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
23785 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
23786 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
23787 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
23788 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
23789 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
23790 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
23791 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
23792 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
23793 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
23794 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
23795 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
23796 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
23797 case X86ISD::XTEST: return "X86ISD::XTEST";
23798 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
23799 case X86ISD::EXPAND: return "X86ISD::EXPAND";
23800 case X86ISD::SELECT: return "X86ISD::SELECT";
23801 case X86ISD::SELECTS: return "X86ISD::SELECTS";
23802 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
23803 case X86ISD::RCP28: return "X86ISD::RCP28";
23804 case X86ISD::RCP28S: return "X86ISD::RCP28S";
23805 case X86ISD::EXP2: return "X86ISD::EXP2";
23806 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
23807 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
23808 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
23809 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
23810 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
23811 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
23812 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
23813 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
23814 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
23815 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
23816 case X86ISD::SCALEF: return "X86ISD::SCALEF";
23817 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
23818 case X86ISD::ADDS: return "X86ISD::ADDS";
23819 case X86ISD::SUBS: return "X86ISD::SUBS";
23820 case X86ISD::AVG: return "X86ISD::AVG";
23821 case X86ISD::MULHRS: return "X86ISD::MULHRS";
23822 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
23823 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
23824 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
23825 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
23826 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
23827 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
23828 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
23829 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
23830 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
23831 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
23832 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
23833 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
23834 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
23835 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
23836 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
23837 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
23838 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
23839 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
23840 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
23841 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
23842 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
23843 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
23844 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
23849 /// Return true if the addressing mode represented by AM is legal for this
23850 /// target, for a load/store of the specified type.
23851 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
23852 const AddrMode &AM, Type *Ty,
23853 unsigned AS) const {
23854 // X86 supports extremely general addressing modes.
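// i.e. anything of the form [base + scale*index + disp32] with scale in
// {1, 2, 4, 8}, subject to the restrictions checked below.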
23855 CodeModel::Model M = getTargetMachine().getCodeModel();
23857 // X86 allows a sign-extended 32-bit immediate field as a displacement.
23858 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
23862 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
23864 // If a reference to this global requires an extra load, we can't fold it.
23865 if (isGlobalStubReference(GVFlags))
23868 // If BaseGV requires a register for the PIC base, we cannot also have a
23869 // BaseReg specified.
23870 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
23873 // If lower 4G is not available, then we must use rip-relative addressing.
23874 if ((M != CodeModel::Small || isPositionIndependent()) &&
23875 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
23879 switch (AM.Scale) {
23885 // These scales always work.
23890 // These scales are formed with basereg+scalereg. Only accept them if
23891 // there is no base register yet.
23895 default: // Other stuff never works.
23902 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
23903 unsigned Bits = Ty->getScalarSizeInBits();
23905 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
23906 // particularly cheaper than those without.
23910 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
23911 // variable shifts just as cheap as scalar ones.
23912 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
23915 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
23916 // fully general vector.
23920 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
23921 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
23923 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
23924 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
23925 return NumBits1 > NumBits2;
23928 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
23929 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
23932 if (!isTypeLegal(EVT::getEVT(Ty1)))
23935 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
23937 // Assuming the caller doesn't have a zeroext or signext return parameter,
23938 // truncation all the way down to i1 is valid.
23942 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
23943 return isInt<32>(Imm);
23946 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
23947 // Can also use sub to handle negated immediates.
23948 return isInt<32>(Imm);
23951 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
23952 if (!VT1.isInteger() || !VT2.isInteger())
23954 unsigned NumBits1 = VT1.getSizeInBits();
23955 unsigned NumBits2 = VT2.getSizeInBits();
23956 return NumBits1 > NumBits2;
23959 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
23960 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
23961 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
23964 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
23965 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
23966 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
23969 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
23970 EVT VT1 = Val.getValueType();
23971 if (isZExtFree(VT1, VT2))
23974 if (Val.getOpcode() != ISD::LOAD)
23977 if (!VT1.isSimple() || !VT1.isInteger() ||
23978 !VT2.isSimple() || !VT2.isInteger())
23981 switch (VT1.getSimpleVT().SimpleTy) {
23986 // X86 has 8, 16, and 32-bit zero-extending loads.
23993 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
23996 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
23997 if (!Subtarget.hasAnyFMA())
24000 VT = VT.getScalarType();
24002 if (!VT.isSimple())
24005 switch (VT.getSimpleVT().SimpleTy) {
24016 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24017 // i16 instructions are longer (0x66 prefix) and potentially slower.
24018 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
24021 /// Targets can use this to indicate that they only support *some*
24022 /// VECTOR_SHUFFLE operations, those with specific masks.
24023 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24024 /// are assumed to be legal.
24026 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
24028 if (!VT.isSimple())
24031 // Not for i1 vectors
24032 if (VT.getSimpleVT().getScalarType() == MVT::i1)
24035 // Very little shuffling can be done for 64-bit vectors right now.
24036 if (VT.getSimpleVT().getSizeInBits() == 64)
24039 // We only care that the types being shuffled are legal. The lowering can
24040 // handle any possible shuffle mask that results.
24041 return isTypeLegal(VT.getSimpleVT());
24045 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24047 // Just delegate to the generic legality, clear masks aren't special.
24048 return isShuffleMaskLegal(Mask, VT);
24051 //===----------------------------------------------------------------------===//
24052 // X86 Scheduler Hooks
24053 //===----------------------------------------------------------------------===//
24055 /// Utility function to emit xbegin specifying the start of an RTM region.
24056 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
24057 const TargetInstrInfo *TII) {
24058 DebugLoc DL = MI.getDebugLoc();
24060 const BasicBlock *BB = MBB->getBasicBlock();
24061 MachineFunction::iterator I = ++MBB->getIterator();
24063 // For the v = xbegin(), we generate
24074 MachineBasicBlock *thisMBB = MBB;
24075 MachineFunction *MF = MBB->getParent();
24076 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
24077 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
24078 MF->insert(I, mainMBB);
24079 MF->insert(I, sinkMBB);
24081 // Transfer the remainder of BB and its successor edges to sinkMBB.
24082 sinkMBB->splice(sinkMBB->begin(), MBB,
24083 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24084 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
24088 // # fallthrough to mainMBB
24089 // # abort to sinkMBB
24090 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
24091 thisMBB->addSuccessor(mainMBB);
24092 thisMBB->addSuccessor(sinkMBB);
24096 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
24097 mainMBB->addSuccessor(sinkMBB);
24100 // EAX is live into the sinkMBB
24101 sinkMBB->addLiveIn(X86::EAX);
24102 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
24103 MI.getOperand(0).getReg())
24106 MI.eraseFromParent();
24110 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
24111 // or XMM0_V32I8 in AVX all of this code can be replaced with that
24112 // in the .td file.
24113 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
24114 const TargetInstrInfo *TII) {
24116 switch (MI.getOpcode()) {
24117 default: llvm_unreachable("illegal opcode!");
24118 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
24119 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
24120 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
24121 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
24122 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
24123 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
24124 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
24125 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
24128 DebugLoc dl = MI.getDebugLoc();
24129 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24131 unsigned NumArgs = MI.getNumOperands();
24132 for (unsigned i = 1; i < NumArgs; ++i) {
24133 MachineOperand &Op = MI.getOperand(i);
24134 if (!(Op.isReg() && Op.isImplicit()))
24135 MIB.addOperand(Op);
24137 if (MI.hasOneMemOperand())
24138 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24140 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24141 .addReg(X86::XMM0);
24143 MI.eraseFromParent();
24147 // FIXME: Custom handling because TableGen doesn't support multiple implicit
24148 // defs in an instruction pattern
24149 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
24150 const TargetInstrInfo *TII) {
24152 switch (MI.getOpcode()) {
24153 default: llvm_unreachable("illegal opcode!");
24154 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
24155 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
24156 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
24157 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
24158 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
24159 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
24160 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
24161 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
24164 DebugLoc dl = MI.getDebugLoc();
24165 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24167 unsigned NumArgs = MI.getNumOperands(); // remove the results
24168 for (unsigned i = 1; i < NumArgs; ++i) {
24169 MachineOperand &Op = MI.getOperand(i);
24170 if (!(Op.isReg() && Op.isImplicit()))
24171 MIB.addOperand(Op);
24173 if (MI.hasOneMemOperand())
24174 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24176 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24179 MI.eraseFromParent();
24183 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24184 const X86Subtarget &Subtarget) {
24185 DebugLoc dl = MI.getDebugLoc();
24186 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24188 // insert input VAL into EAX
24189 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
24190 .addReg(MI.getOperand(0).getReg());
24191 // insert zero to ECX
24192 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24194 // insert zero to EDX
24195 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
24197 // insert WRPKRU instruction
24198 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
24200 MI.eraseFromParent(); // The pseudo is gone now.
24204 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24205 const X86Subtarget &Subtarget) {
24206 DebugLoc dl = MI.getDebugLoc();
24207 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24209 // insert zero to ECX
24210 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24212 // insert RDPKRU instruction
24213 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
24214 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24217 MI.eraseFromParent(); // The pseudo is gone now.
24221 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
24222 const X86Subtarget &Subtarget,
24224 DebugLoc dl = MI.getDebugLoc();
24225 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24226 // Address into RAX/EAX, other two args into ECX, EDX.
24227 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24228 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24229 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24230 for (int i = 0; i < X86::AddrNumOperands; ++i)
24231 MIB.addOperand(MI.getOperand(i));
24233 unsigned ValOps = X86::AddrNumOperands;
24234 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
24235 .addReg(MI.getOperand(ValOps).getReg());
24236 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
24237 .addReg(MI.getOperand(ValOps + 1).getReg());
24239 // The instruction doesn't actually take any operands though.
24240 BuildMI(*BB, MI, dl, TII->get(Opc));
24242 MI.eraseFromParent(); // The pseudo is gone now.
24246 MachineBasicBlock *
24247 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
24248 MachineBasicBlock *MBB) const {
24249 // Emit va_arg instruction on X86-64.
24251 // Operands to this pseudo-instruction:
24252 // 0 ) Output : destination address (reg)
24253 // 1-5) Input : va_list address (addr, i64mem)
24254 // 6 ) ArgSize : Size (in bytes) of vararg type
24255 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
24256 // 8 ) Align : Alignment of type
24257 // 9 ) EFLAGS (implicit-def)
24259 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
24260 static_assert(X86::AddrNumOperands == 5,
24261 "VAARG_64 assumes 5 address operands");
24263 unsigned DestReg = MI.getOperand(0).getReg();
24264 MachineOperand &Base = MI.getOperand(1);
24265 MachineOperand &Scale = MI.getOperand(2);
24266 MachineOperand &Index = MI.getOperand(3);
24267 MachineOperand &Disp = MI.getOperand(4);
24268 MachineOperand &Segment = MI.getOperand(5);
24269 unsigned ArgSize = MI.getOperand(6).getImm();
24270 unsigned ArgMode = MI.getOperand(7).getImm();
24271 unsigned Align = MI.getOperand(8).getImm();
24273 // Memory Reference
24274 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
24275 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
24276 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
24278 // Machine Information
24279 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24280 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
24281 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
24282 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
24283 DebugLoc DL = MI.getDebugLoc();
24285 // struct va_list {
24288 // i64 overflow_area (address)
24289 // i64 reg_save_area (address)
24291 // sizeof(va_list) = 24
24292 // alignment(va_list) = 8
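// Field offsets in this layout: gp_offset at +0, fp_offset at +4,
// overflow_arg_area at +8, reg_save_area at +16. The loads/stores below pick
// gp_offset vs. fp_offset by adding 0 or 4 to the incoming displacement.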
24294 unsigned TotalNumIntRegs = 6;
24295 unsigned TotalNumXMMRegs = 8;
24296 bool UseGPOffset = (ArgMode == 1);
24297 bool UseFPOffset = (ArgMode == 2);
24298 unsigned MaxOffset = TotalNumIntRegs * 8 +
24299 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
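// With 6 integer and 8 XMM argument registers this gives MaxOffset = 48 for a
// GP argument and 48 + 8 * 16 = 176 for an FP argument.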
24301 /* Align ArgSize to a multiple of 8 */
24302 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
24303 bool NeedsAlign = (Align > 8);
24305 MachineBasicBlock *thisMBB = MBB;
24306 MachineBasicBlock *overflowMBB;
24307 MachineBasicBlock *offsetMBB;
24308 MachineBasicBlock *endMBB;
24310 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
24311 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
24312 unsigned OffsetReg = 0;
24314 if (!UseGPOffset && !UseFPOffset) {
24315 // If we only pull from the overflow region, we don't create a branch.
24316 // We don't need to alter control flow.
24317 OffsetDestReg = 0; // unused
24318 OverflowDestReg = DestReg;
24320 offsetMBB = nullptr;
24321 overflowMBB = thisMBB;
24324 // First emit code to check if gp_offset (or fp_offset) is below the bound.
24325 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
24326 // If not, pull from overflow_area. (branch to overflowMBB)
24331 //   thisMBB branches to either offsetMBB or overflowMBB; both rejoin at endMBB.
24336 // Registers for the PHI in endMBB
24337 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
24338 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
24340 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24341 MachineFunction *MF = MBB->getParent();
24342 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24343 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24344 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24346 MachineFunction::iterator MBBIter = ++MBB->getIterator();
24348 // Insert the new basic blocks
24349 MF->insert(MBBIter, offsetMBB);
24350 MF->insert(MBBIter, overflowMBB);
24351 MF->insert(MBBIter, endMBB);
24353 // Transfer the remainder of MBB and its successor edges to endMBB.
24354 endMBB->splice(endMBB->begin(), thisMBB,
24355 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
24356 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
24358 // Make offsetMBB and overflowMBB successors of thisMBB
24359 thisMBB->addSuccessor(offsetMBB);
24360 thisMBB->addSuccessor(overflowMBB);
24362 // endMBB is a successor of both offsetMBB and overflowMBB
24363 offsetMBB->addSuccessor(endMBB);
24364 overflowMBB->addSuccessor(endMBB);
24366 // Load the offset value into a register
24367 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
24368 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
24372 .addDisp(Disp, UseFPOffset ? 4 : 0)
24373 .addOperand(Segment)
24374 .setMemRefs(MMOBegin, MMOEnd);
24376 // Check if there is enough room left to pull this argument.
24377 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
24379 .addImm(MaxOffset + 8 - ArgSizeA8);
24381 // Branch to "overflowMBB" if offset >= max
24382 // Fall through to "offsetMBB" otherwise
24383 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
24384 .addMBB(overflowMBB);
24387 // In offsetMBB, emit code to use the reg_save_area.
24389 assert(OffsetReg != 0);
24391 // Read the reg_save_area address.
24392 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
24393 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
24398 .addOperand(Segment)
24399 .setMemRefs(MMOBegin, MMOEnd);
24401 // Zero-extend the offset
24402 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
24403 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
24406 .addImm(X86::sub_32bit);
24408 // Add the offset to the reg_save_area to get the final address.
24409 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
24410 .addReg(OffsetReg64)
24411 .addReg(RegSaveReg);
24413 // Compute the offset for the next argument
24414 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
24415 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
24417 .addImm(UseFPOffset ? 16 : 8);
24419 // Store it back into the va_list.
24420 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
24424 .addDisp(Disp, UseFPOffset ? 4 : 0)
24425 .addOperand(Segment)
24426 .addReg(NextOffsetReg)
24427 .setMemRefs(MMOBegin, MMOEnd);
24430 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
24435 // Emit code to use overflow area
24438 // Load the overflow_area address into a register.
24439 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
24440 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
24445 .addOperand(Segment)
24446 .setMemRefs(MMOBegin, MMOEnd);
24448 // If we need to align it, do so. Otherwise, just copy the address
24449 // to OverflowDestReg.
24451 // Align the overflow address
24452 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
24453 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
24455 // aligned_addr = (addr + (align-1)) & ~(align-1)
24456 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
24457 .addReg(OverflowAddrReg)
24460 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
24462 .addImm(~(uint64_t)(Align-1));
24464 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
24465 .addReg(OverflowAddrReg);
24468 // Compute the next overflow address after this argument.
24469 // (the overflow address should be kept 8-byte aligned)
24470 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
24471 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
24472 .addReg(OverflowDestReg)
24473 .addImm(ArgSizeA8);
24475 // Store the new overflow address.
24476 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
24481 .addOperand(Segment)
24482 .addReg(NextAddrReg)
24483 .setMemRefs(MMOBegin, MMOEnd);
24485 // If we branched, emit the PHI to the front of endMBB.
24487 BuildMI(*endMBB, endMBB->begin(), DL,
24488 TII->get(X86::PHI), DestReg)
24489 .addReg(OffsetDestReg).addMBB(offsetMBB)
24490 .addReg(OverflowDestReg).addMBB(overflowMBB);
24493 // Erase the pseudo instruction
24494 MI.eraseFromParent();
24499 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
24500 MachineInstr &MI, MachineBasicBlock *MBB) const {
24501 // Emit code to save XMM registers to the stack. The ABI says that the
24502 // number of registers to save is given in %al, so it's theoretically
24503 // possible to do an indirect jump trick to avoid saving all of them;
24504 // however, this code takes a simpler approach and just executes all
24505 // of the stores if %al is non-zero. It's less code, and it's probably
24506 // easier on the hardware branch predictor, and stores aren't all that
24507 // expensive anyway.
24509 // Create the new basic blocks. One block contains all the XMM stores,
24510 // and one block is the final destination regardless of whether any
24511 // stores were performed.
24512 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24513 MachineFunction *F = MBB->getParent();
24514 MachineFunction::iterator MBBIter = ++MBB->getIterator();
24515 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
24516 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
24517 F->insert(MBBIter, XMMSaveMBB);
24518 F->insert(MBBIter, EndMBB);
24520 // Transfer the remainder of MBB and its successor edges to EndMBB.
24521 EndMBB->splice(EndMBB->begin(), MBB,
24522 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24523 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
24525 // The original block will now fall through to the XMM save block.
24526 MBB->addSuccessor(XMMSaveMBB);
24527 // The XMMSaveMBB will fall through to the end block.
24528 XMMSaveMBB->addSuccessor(EndMBB);
24530 // Now add the instructions.
24531 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24532 DebugLoc DL = MI.getDebugLoc();
24534 unsigned CountReg = MI.getOperand(0).getReg();
24535 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
24536 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
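// The %al guard below is only emitted for non-Win64 calling conventions,
// presumably because Win64 varargs do not pass a vector-register count in %al.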
24538 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
24539 // If %al is 0, branch around the XMM save block.
24540 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
24541 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
24542 MBB->addSuccessor(EndMBB);
24545 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
24546 // that was just emitted, but clearly shouldn't be "saved".
24547 assert((MI.getNumOperands() <= 3 ||
24548 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
24549 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
24550 "Expected last argument to be EFLAGS");
24551 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
24552 // In the XMM save block, save all the XMM argument registers.
24553 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
24554 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
24555 MachineMemOperand *MMO = F->getMachineMemOperand(
24556 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
24557 MachineMemOperand::MOStore,
24558 /*Size=*/16, /*Align=*/16);
24559 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
24560 .addFrameIndex(RegSaveFrameIndex)
24561 .addImm(/*Scale=*/1)
24562 .addReg(/*IndexReg=*/0)
24563 .addImm(/*Disp=*/Offset)
24564 .addReg(/*Segment=*/0)
24565 .addReg(MI.getOperand(i).getReg())
24566 .addMemOperand(MMO);
24569 MI.eraseFromParent(); // The pseudo instruction is gone now.
24574 // The EFLAGS operand of SelectItr might be missing a kill marker
24575 // because there were multiple uses of EFLAGS, and ISel didn't know
24576 // which to mark. Figure out whether SelectItr should have had a
24577 // kill marker, and set it if it should. Returns the correct kill marker value.
24579 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
24580 MachineBasicBlock* BB,
24581 const TargetRegisterInfo* TRI) {
24582 // Scan forward through BB for a use/def of EFLAGS.
24583 MachineBasicBlock::iterator miI(std::next(SelectItr));
24584 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
24585 const MachineInstr& mi = *miI;
24586 if (mi.readsRegister(X86::EFLAGS))
24588 if (mi.definesRegister(X86::EFLAGS))
24589 break; // Should have kill-flag - update below.
24592 // If we hit the end of the block, check whether EFLAGS is live into a successor.
24594 if (miI == BB->end()) {
24595 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
24596 sEnd = BB->succ_end();
24597 sItr != sEnd; ++sItr) {
24598 MachineBasicBlock* succ = *sItr;
24599 if (succ->isLiveIn(X86::EFLAGS))
24604 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
24605 // out. SelectMI should have a kill flag on EFLAGS.
24606 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
24610 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
24611 // together with other CMOV pseudo-opcodes into a single basic-block with
24612 // conditional jump around it.
24613 static bool isCMOVPseudo(MachineInstr &MI) {
24614 switch (MI.getOpcode()) {
24615 case X86::CMOV_FR32:
24616 case X86::CMOV_FR64:
24617 case X86::CMOV_GR8:
24618 case X86::CMOV_GR16:
24619 case X86::CMOV_GR32:
24620 case X86::CMOV_RFP32:
24621 case X86::CMOV_RFP64:
24622 case X86::CMOV_RFP80:
24623 case X86::CMOV_V2F64:
24624 case X86::CMOV_V2I64:
24625 case X86::CMOV_V4F32:
24626 case X86::CMOV_V4F64:
24627 case X86::CMOV_V4I64:
24628 case X86::CMOV_V16F32:
24629 case X86::CMOV_V8F32:
24630 case X86::CMOV_V8F64:
24631 case X86::CMOV_V8I64:
24632 case X86::CMOV_V8I1:
24633 case X86::CMOV_V16I1:
24634 case X86::CMOV_V32I1:
24635 case X86::CMOV_V64I1:
24643 MachineBasicBlock *
24644 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
24645 MachineBasicBlock *BB) const {
24646 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24647 DebugLoc DL = MI.getDebugLoc();
24649 // To "insert" a SELECT_CC instruction, we actually have to insert the
24650 // diamond control-flow pattern. The incoming instruction knows the
24651 // destination vreg to set, the condition code register to branch on, the
24652 // true/false values to select between, and a branch opcode to use.
24653 const BasicBlock *LLVM_BB = BB->getBasicBlock();
24654 MachineFunction::iterator It = ++BB->getIterator();
24659 // cmpTY ccX, r1, r2
24661 // fallthrough --> copy0MBB
24662 MachineBasicBlock *thisMBB = BB;
24663 MachineFunction *F = BB->getParent();
24665 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
24666 // as described above, by inserting a BB, and then making a PHI at the join
24667 // point to select the true and false operands of the CMOV in the PHI.
24669 // The code also handles two different cases of multiple CMOV opcodes in a row.
24673 // Case 1: there are multiple CMOVs in a row, all of which are based on
24674 // the same condition setting (or the exact opposite condition setting).
24675 // In this case we can lower all the CMOVs using a single inserted BB, and
24676 // then make a number of PHIs at the join point to model the CMOVs. The only
24677 // trickiness here is that in a case like:
24679 // t2 = CMOV cond1 t1, f1
24680 // t3 = CMOV cond1 t2, f2
24682 // when rewriting this into PHIs, we have to perform some renaming on the
24683 // temps since you cannot have a PHI operand refer to a PHI result earlier
24684 // in the same block. The "simple" but wrong lowering would be:
24686 // t2 = PHI t1(BB1), f1(BB2)
24687 // t3 = PHI t2(BB1), f2(BB2)
24689 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
24690 // renaming is to note that on the path through BB1, t2 is really just a
24691 // copy of t1, and do that renaming, properly generating:
24693 // t2 = PHI t1(BB1), f1(BB2)
24694 // t3 = PHI t1(BB1), f2(BB2)
24696 // Case 2: we lower cascaded CMOVs such as
24698 // (CMOV (CMOV F, T, cc1), T, cc2)
24700 // to two successive branches. For that, we look for another CMOV as the
24701 // following instruction.
24703 // Without this, we would add a PHI between the two jumps, which ends up
24704 // creating a few copies all around. For instance, for
24706 // (sitofp (zext (fcmp une)))
24708 // we would generate:
24710 // ucomiss %xmm1, %xmm0
24711 // movss <1.0f>, %xmm0
24712 // movaps %xmm0, %xmm1
24714 // xorps %xmm1, %xmm1
24717 // movaps %xmm1, %xmm0
24721 // because this custom-inserter would have generated:
24733 // A: X = ...; Y = ...
24735 // C: Z = PHI [X, A], [Y, B]
24737 // E: PHI [X, C], [Z, D]
24739 // If we lower both CMOVs in a single step, we can instead generate:
24751 // A: X = ...; Y = ...
24753 // E: PHI [X, A], [X, C], [Y, D]
24755 // Which, in our sitofp/fcmp example, gives us something like:
24757 // ucomiss %xmm1, %xmm0
24758 // movss <1.0f>, %xmm0
24761 // xorps %xmm0, %xmm0
24765 MachineInstr *CascadedCMOV = nullptr;
24766 MachineInstr *LastCMOV = &MI;
24767 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
24768 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
24769 MachineBasicBlock::iterator NextMIIt =
24770 std::next(MachineBasicBlock::iterator(MI));
24772 // Check for case 1, where there are multiple CMOVs with the same condition
24773 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
24774 // number of jumps the most.
24776 if (isCMOVPseudo(MI)) {
24777 // See if we have a string of CMOVS with the same condition.
24778 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
24779 (NextMIIt->getOperand(3).getImm() == CC ||
24780 NextMIIt->getOperand(3).getImm() == OppCC)) {
24781 LastCMOV = &*NextMIIt;
24786 // Now check for case 2, but only if we didn't already find case 1,
24787 // as indicated by LastCMOV still pointing at MI.
24788 if (LastCMOV == &MI && NextMIIt != BB->end() &&
24789 NextMIIt->getOpcode() == MI.getOpcode() &&
24790 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
24791 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
24792 NextMIIt->getOperand(1).isKill()) {
24793 CascadedCMOV = &*NextMIIt;
24796 MachineBasicBlock *jcc1MBB = nullptr;
24798 // If we have a cascaded CMOV, we lower it to two successive branches to
24799 // the same block. EFLAGS is used by both, so mark it as live in the second.
24800 if (CascadedCMOV) {
24801 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
24802 F->insert(It, jcc1MBB);
24803 jcc1MBB->addLiveIn(X86::EFLAGS);
24806 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
24807 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
24808 F->insert(It, copy0MBB);
24809 F->insert(It, sinkMBB);
24811 // If the EFLAGS register isn't dead in the terminator, then claim that it's
24812 // live into the sink and copy blocks.
24813 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
24815 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
24816 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
24817 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
24818 copy0MBB->addLiveIn(X86::EFLAGS);
24819 sinkMBB->addLiveIn(X86::EFLAGS);
24822 // Transfer the remainder of BB and its successor edges to sinkMBB.
24823 sinkMBB->splice(sinkMBB->begin(), BB,
24824 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
24825 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
24827 // Add the true and fallthrough blocks as its successors.
24828 if (CascadedCMOV) {
24829 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
24830 BB->addSuccessor(jcc1MBB);
24832 // In that case, jcc1MBB will itself fall through to copy0MBB, and
24833 // jump to the sinkMBB.
24834 jcc1MBB->addSuccessor(copy0MBB);
24835 jcc1MBB->addSuccessor(sinkMBB);
24837 BB->addSuccessor(copy0MBB);
24840 // The true block target of the first (or only) branch is always sinkMBB.
24841 BB->addSuccessor(sinkMBB);
24843 // Create the conditional branch instruction.
24844 unsigned Opc = X86::GetCondBranchFromCond(CC);
24845 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
24847 if (CascadedCMOV) {
24848 unsigned Opc2 = X86::GetCondBranchFromCond(
24849 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
24850 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
24854 // %FalseValue = ...
24855 // # fallthrough to sinkMBB
24856 copy0MBB->addSuccessor(sinkMBB);
24859 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
24861 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
24862 MachineBasicBlock::iterator MIItEnd =
24863 std::next(MachineBasicBlock::iterator(LastCMOV));
24864 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
24865 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
24866 MachineInstrBuilder MIB;
24868 // As we are creating the PHIs, we have to be careful if there is more than
24869 // one. Later CMOVs may reference the results of earlier CMOVs, but later
24870 // PHIs have to reference the individual true/false inputs from earlier PHIs.
24871 // That also means that PHI construction must work forward from earlier to
24872 // later, and that the code must maintain a mapping from each earlier PHI's
24873 // destination register to the registers that went into that PHI.
24875 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
24876 unsigned DestReg = MIIt->getOperand(0).getReg();
24877 unsigned Op1Reg = MIIt->getOperand(1).getReg();
24878 unsigned Op2Reg = MIIt->getOperand(2).getReg();
24880 // If the CMOV we are lowering here uses the opposite condition from
24881 // the jump we generated, then we have to swap the operands for the
24882 // PHI that is going to be generated.
24883 if (MIIt->getOperand(3).getImm() == OppCC)
24884 std::swap(Op1Reg, Op2Reg);
24886 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
24887 Op1Reg = RegRewriteTable[Op1Reg].first;
24889 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
24890 Op2Reg = RegRewriteTable[Op2Reg].second;
24892 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
24893 TII->get(X86::PHI), DestReg)
24894 .addReg(Op1Reg).addMBB(copy0MBB)
24895 .addReg(Op2Reg).addMBB(thisMBB);
24897 // Add this PHI to the rewrite table.
24898 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
24901 // If we have a cascaded CMOV, the second Jcc provides the same incoming
24902 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
24903 if (CascadedCMOV) {
24904 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
24905 // Copy the PHI result to the register defined by the second CMOV.
24906 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
24907 DL, TII->get(TargetOpcode::COPY),
24908 CascadedCMOV->getOperand(0).getReg())
24909 .addReg(MI.getOperand(0).getReg());
24910 CascadedCMOV->eraseFromParent();
24913 // Now remove the CMOV(s).
24914 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
24915 (MIIt++)->eraseFromParent();
24920 MachineBasicBlock *
24921 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
24922 MachineBasicBlock *BB) const {
24923 // Combine the following atomic floating-point modification pattern:
24924 // a.store(reg OP a.load(acquire), release)
24925 // Transform it into:
24926 // OPss (%gpr), %xmm
24927 // movss %xmm, (%gpr)
24928 // Or sd equivalent for 64-bit operations.
24930 switch (MI.getOpcode()) {
24931 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
24932 case X86::RELEASE_FADD32mr:
24933 FOp = X86::ADDSSrm;
24934 MOp = X86::MOVSSmr;
24936 case X86::RELEASE_FADD64mr:
24937 FOp = X86::ADDSDrm;
24938 MOp = X86::MOVSDmr;
24941 const X86InstrInfo *TII = Subtarget.getInstrInfo();
24942 DebugLoc DL = MI.getDebugLoc();
24943 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
24944 unsigned ValOpIdx = X86::AddrNumOperands;
24945 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
24946 MachineInstrBuilder MIB =
24947 BuildMI(*BB, MI, DL, TII->get(FOp),
24948 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
24950 for (int i = 0; i < X86::AddrNumOperands; ++i) {
24951 MachineOperand &Operand = MI.getOperand(i);
24952 // Clear any kill flags on register operands as we'll create a second
24953 // instruction using the same address operands.
24954 if (Operand.isReg())
24955 Operand.setIsKill(false);
24956 MIB.addOperand(Operand);
24958 MachineInstr *FOpMI = MIB;
24959 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
24960 for (int i = 0; i < X86::AddrNumOperands; ++i)
24961 MIB.addOperand(MI.getOperand(i));
24962 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
24963 MI.eraseFromParent(); // The pseudo instruction is gone now.
24967 MachineBasicBlock *
24968 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
24969 MachineBasicBlock *BB) const {
24970 MachineFunction *MF = BB->getParent();
24971 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24972 DebugLoc DL = MI.getDebugLoc();
24973 const BasicBlock *LLVM_BB = BB->getBasicBlock();
24975 assert(MF->shouldSplitStack());
24977 const bool Is64Bit = Subtarget.is64Bit();
24978 const bool IsLP64 = Subtarget.isTarget64BitLP64();
24980 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
24981 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
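// These TLS slots hold the current stacklet's stack limit; the offsets are
// assumed to match the split-stack convention used by libgcc/gold
// (%fs:0x70 on LP64, %fs:0x40 on x32, %gs:0x30 on i386).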
24984 // BB:          ... [till the alloca]
24985 //              if the stacklet is not large enough, jump to mallocMBB
24988 // bumpMBB:     allocate by subtracting from RSP
24989 //              jump to continueMBB
24992 // mallocMBB:   allocate by a call to the runtime
24996 // continueMBB: [rest of original BB]
24999 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25000 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25001 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25003 MachineRegisterInfo &MRI = MF->getRegInfo();
25004 const TargetRegisterClass *AddrRegClass =
25005 getRegClassFor(getPointerTy(MF->getDataLayout()));
25007 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25008 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25009 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25010 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25011 sizeVReg = MI.getOperand(1).getReg(),
25013 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
25015 MachineFunction::iterator MBBIter = ++BB->getIterator();
25017 MF->insert(MBBIter, bumpMBB);
25018 MF->insert(MBBIter, mallocMBB);
25019 MF->insert(MBBIter, continueMBB);
25021 continueMBB->splice(continueMBB->begin(), BB,
25022 std::next(MachineBasicBlock::iterator(MI)), BB->end());
25023 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
25025 // Add code to the main basic block to check if the stack limit has been hit,
25026 // and if so, jump to mallocMBB otherwise to bumpMBB.
25027 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
25028 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
25029 .addReg(tmpSPVReg).addReg(sizeVReg);
25030 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
25031 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
25032 .addReg(SPLimitVReg);
25033 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
25035 // bumpMBB simply decreases the stack pointer, since we know the current
25036 // stacklet has enough space.
25037 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
25038 .addReg(SPLimitVReg);
25039 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
25040 .addReg(SPLimitVReg);
25041 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25043 // Calls into a routine in libgcc to allocate more space from the heap.
25044 const uint32_t *RegMask =
25045 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
25047 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
25049 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25050 .addExternalSymbol("__morestack_allocate_stack_space")
25051 .addRegMask(RegMask)
25052 .addReg(X86::RDI, RegState::Implicit)
25053 .addReg(X86::RAX, RegState::ImplicitDefine);
25054 } else if (Is64Bit) {
25055 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
25057 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25058 .addExternalSymbol("__morestack_allocate_stack_space")
25059 .addRegMask(RegMask)
25060 .addReg(X86::EDI, RegState::Implicit)
25061 .addReg(X86::EAX, RegState::ImplicitDefine);
25063 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
25065 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
25066 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
25067 .addExternalSymbol("__morestack_allocate_stack_space")
25068 .addRegMask(RegMask)
25069 .addReg(X86::EAX, RegState::ImplicitDefine);
25073 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
25076 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
25077 .addReg(IsLP64 ? X86::RAX : X86::EAX);
25078 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25080 // Set up the CFG correctly.
25081 BB->addSuccessor(bumpMBB);
25082 BB->addSuccessor(mallocMBB);
25083 mallocMBB->addSuccessor(continueMBB);
25084 bumpMBB->addSuccessor(continueMBB);
25086 // Take care of the PHI nodes.
25087 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
25088 MI.getOperand(0).getReg())
25089 .addReg(mallocPtrVReg)
25091 .addReg(bumpSPPtrVReg)
25094 // Delete the original pseudo instruction.
25095 MI.eraseFromParent();
25098 return continueMBB;
25101 MachineBasicBlock *
25102 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
25103 MachineBasicBlock *BB) const {
25104 MachineFunction *MF = BB->getParent();
25105 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25106 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
25107 DebugLoc DL = MI.getDebugLoc();
25109 assert(!isAsynchronousEHPersonality(
25110 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
25111 "SEH does not use catchret!");
25113 // Only 32-bit EH needs to worry about manually restoring stack pointers.
25114 if (!Subtarget.is32Bit())
25117 // C++ EH creates a new target block to hold the restore code, and wires up
25118 // the new block to the return destination with a normal JMP_4.
25119 MachineBasicBlock *RestoreMBB =
25120 MF->CreateMachineBasicBlock(BB->getBasicBlock());
25121 assert(BB->succ_size() == 1);
25122 MF->insert(std::next(BB->getIterator()), RestoreMBB);
25123 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
25124 BB->addSuccessor(RestoreMBB);
25125 MI.getOperand(0).setMBB(RestoreMBB);
25127 auto RestoreMBBI = RestoreMBB->begin();
25128 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
25129 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
25133 MachineBasicBlock *
25134 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
25135 MachineBasicBlock *BB) const {
25136 MachineFunction *MF = BB->getParent();
25137 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
25138 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
25139 // Only 32-bit SEH requires special handling for catchpad.
25140 if (IsSEH && Subtarget.is32Bit()) {
25141 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25142 DebugLoc DL = MI.getDebugLoc();
25143 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
25145 MI.eraseFromParent();
25149 MachineBasicBlock *
25150 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
25151 MachineBasicBlock *BB) const {
25152 // So, here we replace TLSADDR with the sequence:
25153 // adjust_stackdown -> TLSADDR -> adjust_stackup.
25154 // We need this because TLSADDR is lowered into a call inside MC;
25155 // therefore, without the two markers, shrink-wrapping may push the
25156 // prologue/epilogue past them.
25157 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25158 DebugLoc DL = MI.getDebugLoc();
25159 MachineFunction &MF = *BB->getParent();
25161 // Emit CALLSEQ_START right before the instruction.
25162 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
25163 MachineInstrBuilder CallseqStart =
25164 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
25165 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
25167 // Emit CALLSEQ_END right after the instruction.
25168 // We don't call erase from parent because we want to keep the
25169 // original instruction around.
25170 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
25171 MachineInstrBuilder CallseqEnd =
25172 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
25173 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
25178 MachineBasicBlock *
25179 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
25180 MachineBasicBlock *BB) const {
25181 // This is pretty easy. We're taking the value that we received from
25182 // our load from the relocation, sticking it in either RDI (x86-64)
25183 // or EAX and doing an indirect call. The return value will then
25184 // be in the normal return register.
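// Illustratively, on x86-64 this becomes something like:
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)
// with the TLS address returned in the usual return register.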
25185 MachineFunction *F = BB->getParent();
25186 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25187 DebugLoc DL = MI.getDebugLoc();
25189 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
25190 assert(MI.getOperand(3).isGlobal() && "This should be a global");
25192 // Get a register mask for the lowered call.
25193 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
25194 // proper register mask.
25195 const uint32_t *RegMask =
25196 Subtarget.is64Bit() ?
25197 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
25198 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
25199 if (Subtarget.is64Bit()) {
25200 MachineInstrBuilder MIB =
25201 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
25205 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25206 MI.getOperand(3).getTargetFlags())
25208 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
25209 addDirectMem(MIB, X86::RDI);
25210 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
25211 } else if (!isPositionIndependent()) {
25212 MachineInstrBuilder MIB =
25213 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25217 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25218 MI.getOperand(3).getTargetFlags())
25220 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25221 addDirectMem(MIB, X86::EAX);
25222 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
25224 MachineInstrBuilder MIB =
25225 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25226 .addReg(TII->getGlobalBaseReg(F))
25229 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25230 MI.getOperand(3).getTargetFlags())
25232 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25233 addDirectMem(MIB, X86::EAX);
25234 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
25237 MI.eraseFromParent(); // The pseudo instruction is gone now.
25241 MachineBasicBlock *
25242 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
25243 MachineBasicBlock *MBB) const {
25244 DebugLoc DL = MI.getDebugLoc();
25245 MachineFunction *MF = MBB->getParent();
25246 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25247 MachineRegisterInfo &MRI = MF->getRegInfo();
25249 const BasicBlock *BB = MBB->getBasicBlock();
25250 MachineFunction::iterator I = ++MBB->getIterator();
25252 // Memory Reference
25253 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25254 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25257 unsigned MemOpndSlot = 0;
25259 unsigned CurOp = 0;
25261 DstReg = MI.getOperand(CurOp++).getReg();
25262 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25263 assert(RC->hasType(MVT::i32) && "Invalid destination!");
25264 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25265 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
25267 MemOpndSlot = CurOp;
25269 MVT PVT = getPointerTy(MF->getDataLayout());
25270 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25271 "Invalid Pointer Size!");
25273 // For v = setjmp(buf), we generate
25276 // thisMBB:    buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
25277 //             SjLjSetup restoreMBB
25283 // sinkMBB:    v = phi(main, restore)
25286 // restoreMBB: if the base pointer is being used, reload it from the frame
25289 MachineBasicBlock *thisMBB = MBB;
25290 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25291 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25292 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
25293 MF->insert(I, mainMBB);
25294 MF->insert(I, sinkMBB);
25295 MF->push_back(restoreMBB);
25296 restoreMBB->setHasAddressTaken();
25298 MachineInstrBuilder MIB;
25300 // Transfer the remainder of BB and its successor edges to sinkMBB.
25301 sinkMBB->splice(sinkMBB->begin(), MBB,
25302 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25303 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25306 unsigned PtrStoreOpc = 0;
25307 unsigned LabelReg = 0;
25308 const int64_t LabelOffset = 1 * PVT.getStoreSize();
25309 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
25310 !isPositionIndependent();
25312 // Prepare IP either in reg or imm.
25313 if (!UseImmLabel) {
25314 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
25315 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
25316 LabelReg = MRI.createVirtualRegister(PtrRC);
25317 if (Subtarget.is64Bit()) {
25318 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
25322 .addMBB(restoreMBB)
25325 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
25326 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
25327 .addReg(XII->getGlobalBaseReg(MF))
25330 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
25334 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
25336 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
25337 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25338 if (i == X86::AddrDisp)
25339 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
25341 MIB.addOperand(MI.getOperand(MemOpndSlot + i));
25344 MIB.addReg(LabelReg);
25346 MIB.addMBB(restoreMBB);
25347 MIB.setMemRefs(MMOBegin, MMOEnd);
25349 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
25350 .addMBB(restoreMBB);
25352 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25353 MIB.addRegMask(RegInfo->getNoPreservedMask());
25354 thisMBB->addSuccessor(mainMBB);
25355 thisMBB->addSuccessor(restoreMBB);
25359 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
25360 mainMBB->addSuccessor(sinkMBB);
25363 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
25364 TII->get(X86::PHI), DstReg)
25365 .addReg(mainDstReg).addMBB(mainMBB)
25366 .addReg(restoreDstReg).addMBB(restoreMBB);
25369 if (RegInfo->hasBasePointer(*MF)) {
25370 const bool Uses64BitFramePtr =
25371 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
25372 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
25373 X86FI->setRestoreBasePointer(MF);
25374 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
25375 unsigned BasePtr = RegInfo->getBaseRegister();
25376 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
25377 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
25378 FramePtr, true, X86FI->getRestoreBasePointerOffset())
25379 .setMIFlag(MachineInstr::FrameSetup);
25381 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
25382 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25383 restoreMBB->addSuccessor(sinkMBB);
25385 MI.eraseFromParent();
25389 MachineBasicBlock *
25390 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
25391 MachineBasicBlock *MBB) const {
25392 DebugLoc DL = MI.getDebugLoc();
25393 MachineFunction *MF = MBB->getParent();
25394 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25395 MachineRegisterInfo &MRI = MF->getRegInfo();
25397 // Memory Reference
25398 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25399 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25401 MVT PVT = getPointerTy(MF->getDataLayout());
25402 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25403 "Invalid Pointer Size!");
25405 const TargetRegisterClass *RC =
25406 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
25407 unsigned Tmp = MRI.createVirtualRegister(RC);
25408 // Since FP is only updated here but NOT referenced, it's treated as GPR.
25409 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25410 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
25411 unsigned SP = RegInfo->getStackRegister();
25413 MachineInstrBuilder MIB;
25415 const int64_t LabelOffset = 1 * PVT.getStoreSize();
25416 const int64_t SPOffset = 2 * PVT.getStoreSize();
25418 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
25419 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
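// Buffer layout assumed by the loads below: slot 0 holds the saved frame
// pointer, slot 1 (LabelOffset) the resume address, and slot 2 (SPOffset)
// the saved stack pointer.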
25422 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
25423 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
25424 MIB.addOperand(MI.getOperand(i));
25425 MIB.setMemRefs(MMOBegin, MMOEnd);
25427 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
25428 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25429 if (i == X86::AddrDisp)
25430 MIB.addDisp(MI.getOperand(i), LabelOffset);
25432 MIB.addOperand(MI.getOperand(i));
25434 MIB.setMemRefs(MMOBegin, MMOEnd);
25436 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
25437 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25438 if (i == X86::AddrDisp)
25439 MIB.addDisp(MI.getOperand(i), SPOffset);
25441 MIB.addOperand(MI.getOperand(i));
25443 MIB.setMemRefs(MMOBegin, MMOEnd);
25445 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
25447 MI.eraseFromParent();
25451 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
25452 MachineBasicBlock *MBB,
25453 MachineBasicBlock *DispatchBB,
25455 DebugLoc DL = MI.getDebugLoc();
25456 MachineFunction *MF = MBB->getParent();
25457 MachineRegisterInfo *MRI = &MF->getRegInfo();
25458 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25460 MVT PVT = getPointerTy(MF->getDataLayout());
25461 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
25466 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
25467 !isPositionIndependent();
25470 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
25472 const TargetRegisterClass *TRC =
25473 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
25474 VR = MRI->createVirtualRegister(TRC);
25475 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
25477 /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
25479 if (Subtarget.is64Bit())
25480 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
25484 .addMBB(DispatchBB)
25487 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
25488 .addReg(0) /* XII->getGlobalBaseReg(MF) */
25491 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
25495 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
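// Offset 36 is assumed to be the __jbuf[1] (resume address) slot of the
// 32-bit SjLj function context laid out by SjLjEHPrepare; the dispatch
// block's address is stored there so the runtime can resume into it.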
25496 addFrameReference(MIB, FI, 36);
25498 MIB.addMBB(DispatchBB);
25503 MachineBasicBlock *
25504 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
25505 MachineBasicBlock *BB) const {
25506 DebugLoc DL = MI.getDebugLoc();
25507 MachineFunction *MF = BB->getParent();
25508 MachineFrameInfo &MFI = MF->getFrameInfo();
25509 MachineRegisterInfo *MRI = &MF->getRegInfo();
25510 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25511 int FI = MFI.getFunctionContextIndex();
25513 // Get a mapping of the call site numbers to all of the landing pads they're
25514 // associated with.
25515 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
25516 unsigned MaxCSNum = 0;
25517 for (auto &MBB : *MF) {
25518 if (!MBB.isEHPad())
25521 MCSymbol *Sym = nullptr;
25522 for (const auto &MI : MBB) {
25523 if (MI.isDebugValue())
25526 assert(MI.isEHLabel() && "expected EH_LABEL");
25527 Sym = MI.getOperand(0).getMCSymbol();
25531 if (!MF->hasCallSiteLandingPad(Sym))
25534 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
25535 CallSiteNumToLPad[CSI].push_back(&MBB);
25536 MaxCSNum = std::max(MaxCSNum, CSI);
25540 // Get an ordered list of the machine basic blocks for the jump table.
25541 std::vector<MachineBasicBlock *> LPadList;
25542 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
25543 LPadList.reserve(CallSiteNumToLPad.size());
25545 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
25546 for (auto &LP : CallSiteNumToLPad[CSI]) {
25547 LPadList.push_back(LP);
25548 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
25552 assert(!LPadList.empty() &&
25553 "No landing pad destinations for the dispatch jump table!");
25555 // Create the MBBs for the dispatch code.
25557 // Shove the dispatch's address into the return slot in the function context.
25558 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
25559 DispatchBB->setIsEHPad(true);
25561 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
25562 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
25563 DispatchBB->addSuccessor(TrapBB);
25565 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
25566 DispatchBB->addSuccessor(DispContBB);
25569 MF->push_back(DispatchBB);
25570 MF->push_back(DispContBB);
25571 MF->push_back(TrapBB);
25573 // Insert code into the entry block that creates and registers the function context.
25575 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
25577 // Create the jump table and associated information
25578 MachineJumpTableInfo *JTI =
25579 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
25580 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
25582 const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
25583 const X86RegisterInfo &RI = XII->getRegisterInfo();
25585 // Add a register mask with no preserved registers. This results in all
25586 // registers being marked as clobbered.
25587 if (RI.hasBasePointer(*MF)) {
25588 const bool FPIs64Bit =
25589 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
25590 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
25591 MFI->setRestoreBasePointer(MF);
25593 unsigned FP = RI.getFrameRegister(*MF);
25594 unsigned BP = RI.getBaseRegister();
25595 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
25596 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
25597 MFI->getRestoreBasePointerOffset())
25598 .addRegMask(RI.getNoPreservedMask());
25600 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
25601 .addRegMask(RI.getNoPreservedMask());
25604 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
25605 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
25607 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
25609 .addImm(LPadList.size());
25610 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
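// The call-site index loaded from the function context is compared (unsigned)
// against the number of landing pads; any out-of-range value branches to the
// trap block.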
25612 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
25613 BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
25616 BuildMI(DispContBB, DL,
25617 TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
25619 .addImm(Subtarget.is64Bit() ? 8 : 4)
25621 .addJumpTableIndex(MJTI)
25624 // Add the jump table entries as successors to the MBB.
25625 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
25626 for (auto &LP : LPadList)
25627 if (SeenMBBs.insert(LP).second)
25628 DispContBB->addSuccessor(LP);
25630 // N.B. the order the invoke BBs are processed in doesn't matter here.
25631 SmallVector<MachineBasicBlock *, 64> MBBLPads;
25632 const MCPhysReg *SavedRegs =
25633 Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
25634 for (MachineBasicBlock *MBB : InvokeBBs) {
25635 // Remove the landing pad successor from the invoke block and replace it
25636 // with the new dispatch block.
25637 // Keep a copy of Successors since it's modified inside the loop.
25638 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
25640 // FIXME: Avoid quadratic complexity.
25641 for (auto MBBS : Successors) {
25642 if (MBBS->isEHPad()) {
25643 MBB->removeSuccessor(MBBS);
25644 MBBLPads.push_back(MBBS);
25648 MBB->addSuccessor(DispatchBB);
25650 // Find the invoke call and mark all of the callee-saved registers as
25651 // 'implicit defined' so that they're spilled. This prevents code from
25652 // moving instructions to before the EH block, where they will never be executed.
25654 for (auto &II : reverse(*MBB)) {
25658 DenseMap<unsigned, bool> DefRegs;
25659 for (auto &MOp : II.operands())
25661 DefRegs[MOp.getReg()] = true;
25663 MachineInstrBuilder MIB(*MF, &II);
25664 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
25665 unsigned Reg = SavedRegs[RI];
25667 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
25674 // Mark all former landing pads as non-landing pads. The dispatch is the only
25675 // landing pad now.
25676 for (auto &LP : MBBLPads)
25677 LP->setIsEHPad(false);
25679 // The instruction is gone now.
25680 MI.eraseFromParent();
25684 MachineBasicBlock *
25685 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
25686 MachineBasicBlock *BB) const {
25687 MachineFunction *MF = BB->getParent();
25688 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25689 DebugLoc DL = MI.getDebugLoc();
25691 switch (MI.getOpcode()) {
25692 default: llvm_unreachable("Unexpected instr type to insert");
25693 case X86::TAILJMPd64:
25694 case X86::TAILJMPr64:
25695 case X86::TAILJMPm64:
25696 case X86::TAILJMPr64_REX:
25697 case X86::TAILJMPm64_REX:
25698 llvm_unreachable("TAILJMP64 would not be touched here.");
25699 case X86::TCRETURNdi64:
25700 case X86::TCRETURNri64:
25701 case X86::TCRETURNmi64:
25703 case X86::TLS_addr32:
25704 case X86::TLS_addr64:
25705 case X86::TLS_base_addr32:
25706 case X86::TLS_base_addr64:
25707 return EmitLoweredTLSAddr(MI, BB);
25708 case X86::CATCHRET:
25709 return EmitLoweredCatchRet(MI, BB);
25710 case X86::CATCHPAD:
25711 return EmitLoweredCatchPad(MI, BB);
25712 case X86::SEG_ALLOCA_32:
25713 case X86::SEG_ALLOCA_64:
25714 return EmitLoweredSegAlloca(MI, BB);
25715 case X86::TLSCall_32:
25716 case X86::TLSCall_64:
25717 return EmitLoweredTLSCall(MI, BB);
25718 case X86::CMOV_FR32:
25719 case X86::CMOV_FR64:
25720 case X86::CMOV_FR128:
25721 case X86::CMOV_GR8:
25722 case X86::CMOV_GR16:
25723 case X86::CMOV_GR32:
25724 case X86::CMOV_RFP32:
25725 case X86::CMOV_RFP64:
25726 case X86::CMOV_RFP80:
25727 case X86::CMOV_V2F64:
25728 case X86::CMOV_V2I64:
25729 case X86::CMOV_V4F32:
25730 case X86::CMOV_V4F64:
25731 case X86::CMOV_V4I64:
25732 case X86::CMOV_V16F32:
25733 case X86::CMOV_V8F32:
25734 case X86::CMOV_V8F64:
25735 case X86::CMOV_V8I64:
25736 case X86::CMOV_V8I1:
25737 case X86::CMOV_V16I1:
25738 case X86::CMOV_V32I1:
25739 case X86::CMOV_V64I1:
25740 return EmitLoweredSelect(MI, BB);
25742 case X86::RDFLAGS32:
25743 case X86::RDFLAGS64: {
25745 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
25746 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
25747 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
25748 // Permit reads of the FLAGS register without it being defined.
25749 // This intrinsic exists to read external processor state in flags, such as
25750 // the trap flag, interrupt flag, and direction flag, none of which are
25751 // modeled by the backend.
25752 Push->getOperand(2).setIsUndef();
25753 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
25755 MI.eraseFromParent(); // The pseudo is gone now.
25759 case X86::WRFLAGS32:
25760 case X86::WRFLAGS64: {
25762 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
25764 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
25765 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
25766 BuildMI(*BB, MI, DL, TII->get(PopF));
25768 MI.eraseFromParent(); // The pseudo is gone now.
25772 case X86::RELEASE_FADD32mr:
25773 case X86::RELEASE_FADD64mr:
25774 return EmitLoweredAtomicFP(MI, BB);
25776 case X86::FP32_TO_INT16_IN_MEM:
25777 case X86::FP32_TO_INT32_IN_MEM:
25778 case X86::FP32_TO_INT64_IN_MEM:
25779 case X86::FP64_TO_INT16_IN_MEM:
25780 case X86::FP64_TO_INT32_IN_MEM:
25781 case X86::FP64_TO_INT64_IN_MEM:
25782 case X86::FP80_TO_INT16_IN_MEM:
25783 case X86::FP80_TO_INT32_IN_MEM:
25784 case X86::FP80_TO_INT64_IN_MEM: {
25785 // Change the floating point control register to use "round towards zero"
25786 // mode when truncating to an integer value.
25787 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
25788 addFrameReference(BuildMI(*BB, MI, DL,
25789 TII->get(X86::FNSTCW16m)), CWFrameIdx);
25791 // Load the old value of the control word...
25793 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
25794 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
25797 // Set the high part to be round to zero...
25798 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
25801 // Reload the modified control word now...
25802 addFrameReference(BuildMI(*BB, MI, DL,
25803 TII->get(X86::FLDCW16m)), CWFrameIdx);
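// FIST/FISTP honor the rounding mode in the x87 control word, so forcing the
// RC field to round-toward-zero here gives the C truncation semantics required
// for fp-to-int conversion; the original control word is restored below.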
25805 // Restore the memory image of control word to original value
25806 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
25809 // Get the X86 opcode to use.
25811 switch (MI.getOpcode()) {
25812 default: llvm_unreachable("illegal opcode!");
25813 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
25814 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
25815 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
25816 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
25817 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
25818 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
25819 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
25820 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
25821 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
25824 X86AddressMode AM = getAddressFromInstr(&MI, 0);
25825 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
25826 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
25828 // Reload the original control word now.
25829 addFrameReference(BuildMI(*BB, MI, DL,
25830 TII->get(X86::FLDCW16m)), CWFrameIdx);
25832 MI.eraseFromParent(); // The pseudo instruction is gone now.
25835 // String/text processing lowering.
25836 case X86::PCMPISTRM128REG:
25837 case X86::VPCMPISTRM128REG:
25838 case X86::PCMPISTRM128MEM:
25839 case X86::VPCMPISTRM128MEM:
25840 case X86::PCMPESTRM128REG:
25841 case X86::VPCMPESTRM128REG:
25842 case X86::PCMPESTRM128MEM:
25843 case X86::VPCMPESTRM128MEM:
25844 assert(Subtarget.hasSSE42() &&
25845 "Target must have SSE4.2 or AVX features enabled");
25846 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
25848 // String/text processing lowering.
25849 case X86::PCMPISTRIREG:
25850 case X86::VPCMPISTRIREG:
25851 case X86::PCMPISTRIMEM:
25852 case X86::VPCMPISTRIMEM:
25853 case X86::PCMPESTRIREG:
25854 case X86::VPCMPESTRIREG:
25855 case X86::PCMPESTRIMEM:
25856 case X86::VPCMPESTRIMEM:
25857 assert(Subtarget.hasSSE42() &&
25858 "Target must have SSE4.2 or AVX features enabled");
25859 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
25861 // Thread synchronization.
25863 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
25864 case X86::MONITORX:
25865 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
25868 return emitWRPKRU(MI, BB, Subtarget);
25870 return emitRDPKRU(MI, BB, Subtarget);
25873 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
25875 case X86::VASTART_SAVE_XMM_REGS:
25876 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
25878 case X86::VAARG_64:
25879 return EmitVAARG64WithCustomInserter(MI, BB);
25881 case X86::EH_SjLj_SetJmp32:
25882 case X86::EH_SjLj_SetJmp64:
25883 return emitEHSjLjSetJmp(MI, BB);
25885 case X86::EH_SjLj_LongJmp32:
25886 case X86::EH_SjLj_LongJmp64:
25887 return emitEHSjLjLongJmp(MI, BB);
25889 case X86::Int_eh_sjlj_setup_dispatch:
25890 return EmitSjLjDispatchBlock(MI, BB);
25892 case TargetOpcode::STATEPOINT:
25893 // As an implementation detail, STATEPOINT shares the STACKMAP format at
25894 // this point in the process. We diverge later.
25895 return emitPatchPoint(MI, BB);
25897 case TargetOpcode::STACKMAP:
25898 case TargetOpcode::PATCHPOINT:
25899 return emitPatchPoint(MI, BB);
25901 case X86::LCMPXCHG8B: {
25902 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
25903 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
25904 // requires a memory operand. If it happens that the current architecture is
25905 // i686 and the current function needs a base pointer
25906 // - which is ESI for i686 - the register allocator would not be able to
25907 // allocate registers for an address in the form X(%reg, %reg, Y):
25908 // there would never be enough unreserved registers during regalloc
25909 // (without the need for a base pointer the only option would be X(%edi, %esi, Y)).
25910 // We give the register allocator a hand by precomputing the address in
25911 // a new vreg using LEA.
25913 // If it is not i686 or there is no base pointer - nothing to do here.
25914 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
25917 // Even though this code does not necessarily need the base pointer to
25918 // be ESI, we check for that. The reason: if this assert fails, something
25919 // has changed in the compiler's base pointer handling, which most
25920 // probably has to be addressed somehow here.
25921 assert(TRI->getBaseRegister() == X86::ESI &&
25922 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
25923 "base pointer in mind");
25925 MachineRegisterInfo &MRI = MF->getRegInfo();
25926 MVT SPTy = getPointerTy(MF->getDataLayout());
25927 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25928 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
25930 X86AddressMode AM = getAddressFromInstr(&MI, 0);
25931 // Regalloc does not need any help when the memory operand of CMPXCHG8B
25932 // does not use index register.
25933 if (AM.IndexReg == X86::NoRegister)
25936 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
25937 // four operand definitions that are E[ABCD] registers. We skip them and
25938 // then insert the LEA.
25939 MachineBasicBlock::iterator MBBI(MI);
25940 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
25941 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
25944 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
25946 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
25950 case X86::LCMPXCHG16B:
25952 case X86::LCMPXCHG8B_SAVE_EBX:
25953 case X86::LCMPXCHG16B_SAVE_RBX: {
25955 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
25956 if (!BB->isLiveIn(BasePtr))
25957 BB->addLiveIn(BasePtr);
25963 //===----------------------------------------------------------------------===//
25964 // X86 Optimization Hooks
25965 //===----------------------------------------------------------------------===//
25967 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
25970 const SelectionDAG &DAG,
25971 unsigned Depth) const {
25972 unsigned BitWidth = KnownZero.getBitWidth();
25973 unsigned Opc = Op.getOpcode();
25974 assert((Opc >= ISD::BUILTIN_OP_END ||
25975 Opc == ISD::INTRINSIC_WO_CHAIN ||
25976 Opc == ISD::INTRINSIC_W_CHAIN ||
25977 Opc == ISD::INTRINSIC_VOID) &&
25978 "Should use MaskedValueIsZero if you don't know whether Op"
25979 " is a target node!");
25981 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
25995 // These nodes' second result is a boolean.
25996 if (Op.getResNo() == 0)
25999 case X86ISD::SETCC:
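// X86ISD::SETCC produces 0 or 1 in an i8, so every bit above the low bit is
// known to be zero.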
26000 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
26002 case X86ISD::MOVMSK: {
26003 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
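// MOVMSK sets one result bit per input element (the element's sign bit), so
// all bits above the low NumLoBits are known to be zero.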
26004 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
26007 case X86ISD::VZEXT: {
26008 SDValue N0 = Op.getOperand(0);
26009 unsigned NumElts = Op.getValueType().getVectorNumElements();
26010 unsigned InNumElts = N0.getValueType().getVectorNumElements();
26011 unsigned InBitWidth = N0.getValueType().getScalarSizeInBits();
26013 KnownZero = KnownOne = APInt(InBitWidth, 0);
26014 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
26015 DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
26016 KnownOne = KnownOne.zext(BitWidth);
26017 KnownZero = KnownZero.zext(BitWidth);
26018 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth);
26024 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
26025 SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
26026 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
26027 if (Op.getOpcode() == X86ISD::SETCC_CARRY)
26028 return Op.getScalarValueSizeInBits();
26030 if (Op.getOpcode() == X86ISD::VSEXT) {
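// Sign-extending each element preserves the source's sign bits and adds one
// known copy of the sign bit per extra result bit, so widen the source count
// by the difference in element sizes.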
26031 EVT VT = Op.getValueType();
26032 EVT SrcVT = Op.getOperand(0).getValueType();
26033 unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
26034 Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
26042 /// Returns true (and the GlobalValue and the offset) if the node is a
26043 /// GlobalAddress + offset.
26044 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
26045 const GlobalValue* &GA,
26046 int64_t &Offset) const {
26047 if (N->getOpcode() == X86ISD::Wrapper) {
26048 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
26049 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
26050 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
26054 return TargetLowering::isGAPlusOffset(N, GA, Offset);
26057 // Attempt to match a combined shuffle mask against supported unary shuffle instructions.
26059 // TODO: Investigate sharing more of this with shuffle lowering.
26060 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26062 const X86Subtarget &Subtarget,
26063 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
26064 unsigned NumMaskElts = Mask.size();
26065 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
26067 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
26068 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
26069 isUndefOrEqual(Mask[0], 0) &&
26070 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
26071 Shuffle = X86ISD::VZEXT_MOVL;
26072 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
26076 // Match against a VZEXT instruction.
26077 // TODO: Add 256/512-bit vector support.
26078 if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
26079 unsigned MaxScale = 64 / MaskEltSize;
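// Try each power-of-two extension factor: a VZEXT by Scale keeps every
// Scale'th source element and zero-fills the rest, so the mask must place
// source element i at position i*Scale with zero/undef padding after it.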
26080 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
26082 unsigned NumDstElts = NumMaskElts / Scale;
26083 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
26084 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
26085 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
26089 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
26090 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
26091 Shuffle = X86ISD::VZEXT;
26097 // Check if we have SSE3, which will let us use MOVDDUP etc. These
26098 // instructions are no slower than UNPCKLPD but have the option to
26099 // fold the input operand, even from an unaligned memory load.
26100 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
26101 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
26102 Shuffle = X86ISD::MOVDDUP;
26103 SrcVT = DstVT = MVT::v2f64;
26106 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26107 Shuffle = X86ISD::MOVSLDUP;
26108 SrcVT = DstVT = MVT::v4f32;
26111 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
26112 Shuffle = X86ISD::MOVSHDUP;
26113 SrcVT = DstVT = MVT::v4f32;
26118 if (MaskVT.is256BitVector() && FloatDomain) {
26119 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
26120 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26121 Shuffle = X86ISD::MOVDDUP;
26122 SrcVT = DstVT = MVT::v4f64;
26125 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26126 Shuffle = X86ISD::MOVSLDUP;
26127 SrcVT = DstVT = MVT::v8f32;
26130 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
26131 Shuffle = X86ISD::MOVSHDUP;
26132 SrcVT = DstVT = MVT::v8f32;
26137 if (MaskVT.is512BitVector() && FloatDomain) {
26138 assert(Subtarget.hasAVX512() &&
26139 "AVX512 required for 512-bit vector shuffles");
26140 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26141 Shuffle = X86ISD::MOVDDUP;
26142 SrcVT = DstVT = MVT::v8f64;
26145 if (isTargetShuffleEquivalent(
26146 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
26147 Shuffle = X86ISD::MOVSLDUP;
26148 SrcVT = DstVT = MVT::v16f32;
26151 if (isTargetShuffleEquivalent(
26152 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
26153 Shuffle = X86ISD::MOVSHDUP;
26154 SrcVT = DstVT = MVT::v16f32;
26159 // Attempt to match against broadcast-from-vector.
26160 if (Subtarget.hasAVX2()) {
26161 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
26162 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
26163 SrcVT = DstVT = MaskVT;
26164 Shuffle = X86ISD::VBROADCAST;
26172 // Attempt to match a combined shuffle mask against supported unary immediate
26173 // permute instructions.
26174 // TODO: Investigate sharing more of this with shuffle lowering.
26175 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26177 const X86Subtarget &Subtarget,
26178 unsigned &Shuffle, MVT &ShuffleVT,
26179 unsigned &PermuteImm) {
26180 unsigned NumMaskElts = Mask.size();
26182 bool ContainsZeros = false;
26183 SmallBitVector Zeroable(NumMaskElts, false);
26184 for (unsigned i = 0; i != NumMaskElts; ++i) {
26186 Zeroable[i] = isUndefOrZero(M);
26187 ContainsZeros |= (M == SM_SentinelZero);
26190 // Attempt to match against byte/bit shifts.
26191 // FIXME: Add 512-bit support.
26192 if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26193 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
26194 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
26195 MaskVT.getScalarSizeInBits(), Mask,
26196 0, Zeroable, Subtarget);
26197 if (0 < ShiftAmt) {
26198 PermuteImm = (unsigned)ShiftAmt;
26203 // Ensure the mask doesn't contain any zero elements.
26207 assert(llvm::all_of(Mask, [&](int M) {
26208 return SM_SentinelUndef <= M && M < (int)NumMaskElts;
26209 }) && "Expected unary shuffle");
26211 unsigned InputSizeInBits = MaskVT.getSizeInBits();
26212 unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
26213 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
26215 // Handle PSHUFLW/PSHUFHW repeated patterns.
26216 if (MaskScalarSizeInBits == 16) {
26217 SmallVector<int, 4> RepeatedMask;
26218 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
26219 ArrayRef<int> LoMask(Mask.data() + 0, 4);
26220 ArrayRef<int> HiMask(Mask.data() + 4, 4);
26222 // PSHUFLW: permute lower 4 elements only.
26223 if (isUndefOrInRange(LoMask, 0, 4) &&
26224 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
26225 Shuffle = X86ISD::PSHUFLW;
26226 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26227 PermuteImm = getV4X86ShuffleImm(LoMask);
26231 // PSHUFHW: permute upper 4 elements only.
26232 if (isUndefOrInRange(HiMask, 4, 8) &&
26233 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
26234 // Offset the HiMask so that we can create the shuffle immediate.
26235 int OffsetHiMask[4];
26236 for (int i = 0; i != 4; ++i)
26237 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
26239 Shuffle = X86ISD::PSHUFHW;
26240 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26241 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
26250 // We only support permutation of 32/64 bit elements after this.
26251 if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
26254 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
26255 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
26256 if (FloatDomain && !Subtarget.hasAVX())
26259 // Pre-AVX2 we must use float shuffles on 256-bit vectors.
26260 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
26261 FloatDomain = true;
26263 // Check for lane crossing permutes.
26264 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
26265 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
26266 if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
26267 Shuffle = X86ISD::VPERMI;
26268 ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
26269 PermuteImm = getV4X86ShuffleImm(Mask);
26272 if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
26273 SmallVector<int, 4> RepeatedMask;
26274 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
26275 Shuffle = X86ISD::VPERMI;
26276 ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
26277 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
26284 // VPERMILPD can permute with a non-repeating shuffle.
26285 if (FloatDomain && MaskScalarSizeInBits == 64) {
26286 Shuffle = X86ISD::VPERMILPI;
26287 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
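// VPERMILPD takes one control bit per f64 element: bit i chooses between the
// even (0) and odd (1) element of the 128-bit lane containing element i.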
26289 for (int i = 0, e = Mask.size(); i != e; ++i) {
26291 if (M == SM_SentinelUndef)
26293 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
26294 PermuteImm |= (M & 1) << i;
26299 // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
26300 SmallVector<int, 4> RepeatedMask;
26301 if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
26304 // Narrow the repeated mask for 32-bit element permutes.
26305 SmallVector<int, 4> WordMask = RepeatedMask;
26306 if (MaskScalarSizeInBits == 64)
26307 scaleShuffleMask(2, RepeatedMask, WordMask);
26309 Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
26310 ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
26311 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
26312 PermuteImm = getV4X86ShuffleImm(WordMask);
26316 // Attempt to match a combined unary shuffle mask against supported binary
26317 // shuffle instructions.
26318 // TODO: Investigate sharing more of this with shuffle lowering.
26319 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26320 bool FloatDomain, SDValue &V1, SDValue &V2,
26321 const X86Subtarget &Subtarget,
26322 unsigned &Shuffle, MVT &ShuffleVT,
26324 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
26326 if (MaskVT.is128BitVector()) {
26327 if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
26329 Shuffle = X86ISD::MOVLHPS;
26330 ShuffleVT = MVT::v4f32;
26333 if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
26335 Shuffle = X86ISD::MOVHLPS;
26336 ShuffleVT = MVT::v4f32;
26339 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
26340 (FloatDomain || !Subtarget.hasSSE41())) {
26342 Shuffle = X86ISD::MOVSD;
26343 ShuffleVT = MaskVT;
26346 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
26347 (FloatDomain || !Subtarget.hasSSE41())) {
26348 Shuffle = X86ISD::MOVSS;
26349 ShuffleVT = MaskVT;
26354 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
26355 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
26356 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26357 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
26358 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
26359 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
26360 MVT LegalVT = MaskVT;
26361 if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
26362 LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
26364 SmallVector<int, 64> Unpckl, Unpckh;
26366 createUnpackShuffleMask(MaskVT, Unpckl, true, true);
26367 if (isTargetShuffleEquivalent(Mask, Unpckl)) {
26369 Shuffle = X86ISD::UNPCKL;
26370 ShuffleVT = LegalVT;
26374 createUnpackShuffleMask(MaskVT, Unpckh, false, true);
26375 if (isTargetShuffleEquivalent(Mask, Unpckh)) {
26377 Shuffle = X86ISD::UNPCKH;
26378 ShuffleVT = LegalVT;
26382 createUnpackShuffleMask(MaskVT, Unpckl, true, false);
26383 if (isTargetShuffleEquivalent(Mask, Unpckl)) {
26384 Shuffle = X86ISD::UNPCKL;
26385 ShuffleVT = LegalVT;
26389 createUnpackShuffleMask(MaskVT, Unpckh, false, false);
26390 if (isTargetShuffleEquivalent(Mask, Unpckh)) {
26391 Shuffle = X86ISD::UNPCKH;
26392 ShuffleVT = LegalVT;
26396 ShuffleVectorSDNode::commuteMask(Unpckl);
26397 if (isTargetShuffleEquivalent(Mask, Unpckl)) {
26399 Shuffle = X86ISD::UNPCKL;
26400 ShuffleVT = LegalVT;
26404 ShuffleVectorSDNode::commuteMask(Unpckh);
26405 if (isTargetShuffleEquivalent(Mask, Unpckh)) {
26407 Shuffle = X86ISD::UNPCKH;
26408 ShuffleVT = LegalVT;
26417 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26419 SDValue &V1, SDValue &V2,
26420 SDLoc &DL, SelectionDAG &DAG,
26421 const X86Subtarget &Subtarget,
26422 unsigned &Shuffle, MVT &ShuffleVT,
26423 unsigned &PermuteImm) {
26424 unsigned NumMaskElts = Mask.size();
26426 // Attempt to match against PALIGNR byte rotate.
26427 if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
26428 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
26429 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
26430 if (0 < ByteRotation) {
26431 Shuffle = X86ISD::PALIGNR;
26432 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
26433 PermuteImm = ByteRotation;
26438 // Attempt to combine to X86ISD::BLENDI.
26439 if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
26440 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
26441 // Determine a type compatible with X86ISD::BLENDI.
26442 // TODO - add 16i16 support (requires lane duplication).
26443 MVT BlendVT = MaskVT;
26444 if (Subtarget.hasAVX2()) {
26445 if (BlendVT == MVT::v4i64)
26446 BlendVT = MVT::v8i32;
26447 else if (BlendVT == MVT::v2i64)
26448 BlendVT = MVT::v4i32;
26450 if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
26451 BlendVT = MVT::v8i16;
26452 else if (BlendVT == MVT::v4i64)
26453 BlendVT = MVT::v4f64;
26454 else if (BlendVT == MVT::v8i32)
26455 BlendVT = MVT::v8f32;
26458 unsigned BlendSize = BlendVT.getVectorNumElements();
26459 unsigned MaskRatio = BlendSize / NumMaskElts;
26461 // Can we blend with zero?
26462 if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
26464 NumMaskElts <= BlendVT.getVectorNumElements()) {
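// Build the BLENDI immediate for a blend with zero: each set bit selects the
// corresponding element from the second source, which we replace below with
// an all-zeros vector.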
26466 for (unsigned i = 0; i != BlendSize; ++i)
26467 if (Mask[i / MaskRatio] < 0)
26468 PermuteImm |= 1u << i;
26470 V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
26471 Shuffle = X86ISD::BLENDI;
26472 ShuffleVT = BlendVT;
26476 // Attempt to match as a binary blend.
26477 if (NumMaskElts <= BlendVT.getVectorNumElements()) {
26478 bool MatchBlend = true;
26479 for (int i = 0; i != (int)NumMaskElts; ++i) {
26481 if (M == SM_SentinelUndef)
26483 else if (M == SM_SentinelZero)
26484 MatchBlend = false;
26485 else if ((M != i) && (M != (i + (int)NumMaskElts)))
26486 MatchBlend = false;
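// For a plain two-input blend, set immediate bit i whenever the mask element
// covering it takes its value from the second source.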
26491 for (unsigned i = 0; i != BlendSize; ++i)
26492 if ((int)NumMaskElts <= Mask[i / MaskRatio])
26493 PermuteImm |= 1u << i;
26495 Shuffle = X86ISD::BLENDI;
26496 ShuffleVT = BlendVT;
26502 // Attempt to combine to INSERTPS.
26503 if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
26504 SmallBitVector Zeroable(4, false);
26505 for (unsigned i = 0; i != NumMaskElts; ++i)
26507 Zeroable[i] = true;
26509 if (Zeroable.any() &&
26510 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
26511 Shuffle = X86ISD::INSERTPS;
26512 ShuffleVT = MVT::v4f32;
26517 // Attempt to combine to SHUFPD.
26518 if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
26519 (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
26520 (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
26521 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
26522 Shuffle = X86ISD::SHUFP;
26523 ShuffleVT = MaskVT;
26528 // Attempt to combine to SHUFPS.
26529 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
26530 (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
26531 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
26532 SmallVector<int, 4> RepeatedMask;
26533 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
26534 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
26535 int M0 = RepeatedMask[Offset];
26536 int M1 = RepeatedMask[Offset + 1];
26538 if (isUndefInRange(RepeatedMask, Offset, 2)) {
26539 return DAG.getUNDEF(MaskVT);
26540 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
26541 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
26542 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
26543 return getZeroVector(MaskVT, Subtarget, DAG, DL);
26544 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
26545 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
26546 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
26548 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
26549 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
26550 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
26557 int ShufMask[4] = {-1, -1, -1, -1};
26558 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
26559 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
26564 Shuffle = X86ISD::SHUFP;
26565 ShuffleVT = MaskVT;
26566 PermuteImm = getV4X86ShuffleImm(ShufMask);
26575 /// \brief Combine an arbitrary chain of shuffles into a single instruction if possible.
26578 /// This is the leaf of the recursive combine below. When we have found some
26579 /// chain of single-use x86 shuffle instructions and accumulated the combined
26580 /// shuffle mask represented by them, this will try to pattern match that mask
26581 /// into either a single instruction if there is a special purpose instruction
26582 /// for this operation, or into a PSHUFB instruction which is a fully general
26583 /// instruction but should only be used to replace chains over a certain depth.
26584 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
26585 ArrayRef<int> BaseMask, int Depth,
26586 bool HasVariableMask, SelectionDAG &DAG,
26587 TargetLowering::DAGCombinerInfo &DCI,
26588 const X86Subtarget &Subtarget) {
26589 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
26590 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
26591 "Unexpected number of shuffle inputs!");
26593 // Find the inputs that enter the chain. Note that multiple uses are OK
26594 // here; we're not going to remove the operands we find.
26595 bool UnaryShuffle = (Inputs.size() == 1);
26596 SDValue V1 = peekThroughBitcasts(Inputs[0]);
26597 SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));
26599 MVT VT1 = V1.getSimpleValueType();
26600 MVT VT2 = V2.getSimpleValueType();
26601 MVT RootVT = Root.getSimpleValueType();
26602 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
26603 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
26604 "Vector size mismatch");
26609 unsigned NumBaseMaskElts = BaseMask.size();
26610 if (NumBaseMaskElts == 1) {
26611 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
26612 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
26617 unsigned RootSizeInBits = RootVT.getSizeInBits();
26618 unsigned NumRootElts = RootVT.getVectorNumElements();
26619 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
26620 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
26621 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
26623 // Don't combine if we are an AVX512/EVEX target and the mask element size
26624 // is different from the root element size - this would prevent writemasks
26625 // from being reused.
26626 // TODO - this currently prevents all lane shuffles from occurring.
26627 // TODO - check for writemasks usage instead of always preventing combining.
26628 // TODO - attempt to narrow Mask back to writemask size.
26629 bool IsEVEXShuffle =
26630 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
26631 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
26634 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
26636 // Handle 128-bit lane shuffles of 256-bit vectors.
26637 // TODO - this should support binary shuffles.
26638 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
26639 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
26640 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
26641 return false; // Nothing to do!
26642 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
26643 unsigned PermMask = 0;
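// VPERM2X128's immediate holds one selector nibble per destination 128-bit
// half: the low bits pick the source lane and bit 3 zeroes the half, so a
// negative (zeroable) base mask element is encoded as 0x8.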
26644 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
26645 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
26647 Res = DAG.getBitcast(ShuffleVT, V1);
26648 DCI.AddToWorklist(Res.getNode());
26649 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
26650 DAG.getUNDEF(ShuffleVT),
26651 DAG.getConstant(PermMask, DL, MVT::i8));
26652 DCI.AddToWorklist(Res.getNode());
26653 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26658 // For masks that have been widened to 128-bit elements or more,
26659 // narrow back down to 64-bit elements.
26660 SmallVector<int, 64> Mask;
26661 if (BaseMaskEltSizeInBits > 64) {
26662 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
26663 int MaskScale = BaseMaskEltSizeInBits / 64;
26664 scaleShuffleMask(MaskScale, BaseMask, Mask);
26666 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
26669 unsigned NumMaskElts = Mask.size();
26670 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
26672 // Determine the effective mask value type.
26673 FloatDomain &= (32 <= MaskEltSizeInBits);
26674 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
26675 : MVT::getIntegerVT(MaskEltSizeInBits);
26676 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
26678 // Only allow legal mask types.
26679 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
26682 // Attempt to match the mask against known shuffle patterns.
26683 MVT ShuffleSrcVT, ShuffleVT;
26684 unsigned Shuffle, PermuteImm;
26686 if (UnaryShuffle) {
26687 // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
26688 // directly if we don't shuffle the lower element and we shuffle the upper
26689 // (zero) elements within themselves.
26690 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
26691 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
26692 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
26693 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
26694 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
26695 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
26696 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
26702 if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle,
26703 ShuffleSrcVT, ShuffleVT)) {
26704 if (Depth == 1 && Root.getOpcode() == Shuffle)
26705 return false; // Nothing to do!
26706 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26707 return false; // AVX512 Writemask clash.
26708 Res = DAG.getBitcast(ShuffleSrcVT, V1);
26709 DCI.AddToWorklist(Res.getNode());
26710 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
26711 DCI.AddToWorklist(Res.getNode());
26712 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26717 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget,
26718 Shuffle, ShuffleVT, PermuteImm)) {
26719 if (Depth == 1 && Root.getOpcode() == Shuffle)
26720 return false; // Nothing to do!
26721 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26722 return false; // AVX512 Writemask clash.
26723 Res = DAG.getBitcast(ShuffleVT, V1);
26724 DCI.AddToWorklist(Res.getNode());
26725 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
26726 DAG.getConstant(PermuteImm, DL, MVT::i8));
26727 DCI.AddToWorklist(Res.getNode());
26728 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26734 if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget,
26735 Shuffle, ShuffleVT, UnaryShuffle)) {
26736 if (Depth == 1 && Root.getOpcode() == Shuffle)
26737 return false; // Nothing to do!
26738 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26739 return false; // AVX512 Writemask clash.
26740 V1 = DAG.getBitcast(ShuffleVT, V1);
26741 DCI.AddToWorklist(V1.getNode());
26742 V2 = DAG.getBitcast(ShuffleVT, V2);
26743 DCI.AddToWorklist(V2.getNode());
26744 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
26745 DCI.AddToWorklist(Res.getNode());
26746 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26751 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL,
26752 DAG, Subtarget, Shuffle, ShuffleVT,
26754 if (Depth == 1 && Root.getOpcode() == Shuffle)
26755 return false; // Nothing to do!
26756 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26757 return false; // AVX512 Writemask clash.
26758 V1 = DAG.getBitcast(ShuffleVT, V1);
26759 DCI.AddToWorklist(V1.getNode());
26760 V2 = DAG.getBitcast(ShuffleVT, V2);
26761 DCI.AddToWorklist(V2.getNode());
26762 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
26763 DAG.getConstant(PermuteImm, DL, MVT::i8));
26764 DCI.AddToWorklist(Res.getNode());
26765 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26770 // Don't try to re-form single instruction chains under any circumstances now
26771 // that we've done encoding canonicalization for them.
26775 bool MaskContainsZeros =
26776 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
26778 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
26779 // If we have a single input lane-crossing shuffle then lower to VPERMV.
26780 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
26781 ((Subtarget.hasAVX2() &&
26782 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
26783 (Subtarget.hasAVX512() &&
26784 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
26785 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
26786 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
26787 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
26788 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
26789 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
26790 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
26791 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
26792 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
26793 DCI.AddToWorklist(VPermMask.getNode());
26794 Res = DAG.getBitcast(MaskVT, V1);
26795 DCI.AddToWorklist(Res.getNode());
26796 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
26797 DCI.AddToWorklist(Res.getNode());
26798 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26803 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
26804 // vector as the second source.
26805 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
26806 ((Subtarget.hasAVX512() &&
26807 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
26808 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
26809 (Subtarget.hasVLX() &&
26810 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
26811 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
26812 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
26813 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
26814 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
26815 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
26816 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
26817 for (unsigned i = 0; i != NumMaskElts; ++i)
26818 if (Mask[i] == SM_SentinelZero)
26819 Mask[i] = NumMaskElts + i;
26821 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
26822 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
26823 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
26824 DCI.AddToWorklist(VPermMask.getNode());
26825 Res = DAG.getBitcast(MaskVT, V1);
26826 DCI.AddToWorklist(Res.getNode());
26827 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
26828 DCI.AddToWorklist(Zero.getNode());
26829 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
26830 DCI.AddToWorklist(Res.getNode());
26831 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26836 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
26837 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
26838 ((Subtarget.hasAVX512() &&
26839 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
26840 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
26841 (Subtarget.hasVLX() &&
26842 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
26843 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
26844 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
26845 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
26846 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
26847 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
26848 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
26849 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
26850 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
26851 DCI.AddToWorklist(VPermMask.getNode());
26852 V1 = DAG.getBitcast(MaskVT, V1);
26853 DCI.AddToWorklist(V1.getNode());
26854 V2 = DAG.getBitcast(MaskVT, V2);
26855 DCI.AddToWorklist(V2.getNode());
26856 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
26857 DCI.AddToWorklist(Res.getNode());
26858 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26865 // See if we can combine a single input shuffle with zeros to a bit-mask,
26866 // which is much simpler than any shuffle.
26867 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
26868 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
26869 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
26870 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
26871 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
26872 SmallBitVector UndefElts(NumMaskElts, false);
26873 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
26874 for (unsigned i = 0; i != NumMaskElts; ++i) {
26876 if (M == SM_SentinelUndef) {
26877 UndefElts[i] = true;
26880 if (M == SM_SentinelZero)
26882 EltBits[i] = AllOnes;
26884 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
26885 DCI.AddToWorklist(BitMask.getNode());
26886 Res = DAG.getBitcast(MaskVT, V1);
26887 DCI.AddToWorklist(Res.getNode());
26888 unsigned AndOpcode =
26889 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
26890 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
26891 DCI.AddToWorklist(Res.getNode());
26892 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26897 // If we have a single input shuffle with different shuffle patterns in the
26898 // 128-bit lanes, lower to VPERMILPS with a variable mask.
26899 // TODO Combine other mask types at higher depths.
26900 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
26901 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
26902 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
26903 SmallVector<SDValue, 16> VPermIdx;
26904 for (int M : Mask) {
26906 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
26907 VPermIdx.push_back(Idx);
26909 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
26910 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
26911 DCI.AddToWorklist(VPermMask.getNode());
26912 Res = DAG.getBitcast(MaskVT, V1);
26913 DCI.AddToWorklist(Res.getNode());
26914 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
26915 DCI.AddToWorklist(Res.getNode());
26916 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26921 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
26922 // to VPERMIL2PD/VPERMIL2PS.
26923 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
26924 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
26925 MaskVT == MVT::v8f32)) {
26926 // VPERMIL2 Operation.
26927 // Bits[3] - Match Bit.
26928 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
26929 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
26930 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
26931 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
26932 SmallVector<int, 8> VPerm2Idx;
26933 MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
26934 MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
26935 unsigned M2ZImm = 0;
26936 for (int M : Mask) {
26937 if (M == SM_SentinelUndef) {
26938 VPerm2Idx.push_back(-1);
26941 if (M == SM_SentinelZero) {
26943 VPerm2Idx.push_back(8);
26946 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
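// The PD form keeps its element selector in bits [2:1] of each control
// element, so scale the combined index up by one bit; the PS form uses
// bits [2:0] directly.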
26947 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
26948 VPerm2Idx.push_back(Index);
26950 V1 = DAG.getBitcast(MaskVT, V1);
26951 DCI.AddToWorklist(V1.getNode());
26952 V2 = DAG.getBitcast(MaskVT, V2);
26953 DCI.AddToWorklist(V2.getNode());
26954 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
26955 DCI.AddToWorklist(VPerm2MaskOp.getNode());
26956 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
26957 DAG.getConstant(M2ZImm, DL, MVT::i8));
26958 DCI.AddToWorklist(Res.getNode());
26959 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26964 // If we have 3 or more shuffle instructions or a chain involving a variable
26965 // mask, we can replace them with a single PSHUFB instruction profitably.
26966 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
26967 // instructions, but in practice PSHUFB tends to be *very* fast so we're
26968 // more aggressive.
26969 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
26970 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
26971 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
26972 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
26973 SmallVector<SDValue, 16> PSHUFBMask;
26974 int NumBytes = RootVT.getSizeInBits() / 8;
26975 int Ratio = NumBytes / NumMaskElts;
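// Expand each mask element into Ratio consecutive byte indices so that the
// accumulated element-wise mask becomes a per-byte PSHUFB control vector.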
26976 for (int i = 0; i < NumBytes; ++i) {
26977 int M = Mask[i / Ratio];
26978 if (M == SM_SentinelUndef) {
26979 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
26982 if (M == SM_SentinelZero) {
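// PSHUFB zeroes a destination byte when the top bit of its control byte is
// set; 255 (0xFF) has that bit set.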
26983 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
26986 M = Ratio * M + i % Ratio;
26987 assert((M / 16) == (i / 16) && "Lane crossing detected");
26988 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
26990 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
26991 Res = DAG.getBitcast(ByteVT, V1);
26992 DCI.AddToWorklist(Res.getNode());
26993 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
26994 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
26995 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
26996 DCI.AddToWorklist(Res.getNode());
26997 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27002 // With XOP, if we have a 128-bit binary input shuffle we can always combine
27003 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
27004 // slower than PSHUFB on targets that support both.
27005 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
27006 Subtarget.hasXOP()) {
27007 // VPPERM Mask Operation
27008 // Bits[4:0] - Byte Index (0 - 31)
27009 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
27010 SmallVector<SDValue, 16> VPPERMMask;
27012 int Ratio = NumBytes / NumMaskElts;
27013 for (int i = 0; i < NumBytes; ++i) {
27014 int M = Mask[i / Ratio];
27015 if (M == SM_SentinelUndef) {
27016 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
27019 if (M == SM_SentinelZero) {
27020 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
27023 M = Ratio * M + i % Ratio;
27024 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27026 MVT ByteVT = MVT::v16i8;
27027 V1 = DAG.getBitcast(ByteVT, V1);
27028 DCI.AddToWorklist(V1.getNode());
27029 V2 = DAG.getBitcast(ByteVT, V2);
27030 DCI.AddToWorklist(V2.getNode());
27031 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
27032 DCI.AddToWorklist(VPPERMMaskOp.getNode());
27033 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
27034 DCI.AddToWorklist(Res.getNode());
27035 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27040 // Failed to find any combines.
27044 // Attempt to constant fold all of the constant source ops.
27045 // Returns true if the entire shuffle is folded to a constant.
27046 // TODO: Extend this to merge multiple constant Ops and update the mask.
27047 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
27048 ArrayRef<int> Mask, SDValue Root,
27049 bool HasVariableMask, SelectionDAG &DAG,
27050 TargetLowering::DAGCombinerInfo &DCI,
27051 const X86Subtarget &Subtarget) {
27052 MVT VT = Root.getSimpleValueType();
27054 unsigned SizeInBits = VT.getSizeInBits();
27055 unsigned NumMaskElts = Mask.size();
27056 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
27057 unsigned NumOps = Ops.size();
27059 // Extract constant bits from each source op.
27060 bool OneUseConstantOp = false;
27061 SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
27062 SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
27063 for (unsigned i = 0; i != NumOps; ++i) {
27064 SDValue SrcOp = Ops[i];
27065 OneUseConstantOp |= SrcOp.hasOneUse();
27066 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
27071 // Only fold if at least one of the constants is only used once or
27072 // the combined shuffle has included a variable mask shuffle, this
27073 // is to avoid constant pool bloat.
27074 if (!OneUseConstantOp && !HasVariableMask)
27077 // Shuffle the constant bits according to the mask.
27078 SmallBitVector UndefElts(NumMaskElts, false);
27079 SmallBitVector ZeroElts(NumMaskElts, false);
27080 SmallBitVector ConstantElts(NumMaskElts, false);
27081 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
27082 APInt::getNullValue(MaskSizeInBits));
27083 for (unsigned i = 0; i != NumMaskElts; ++i) {
27085 if (M == SM_SentinelUndef) {
27086 UndefElts[i] = true;
27088 } else if (M == SM_SentinelZero) {
27089 ZeroElts[i] = true;
27092 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
27094 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
27095 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
27097 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
27098 if (SrcUndefElts[SrcMaskIdx]) {
27099 UndefElts[i] = true;
27103 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
27104 APInt &Bits = SrcEltBits[SrcMaskIdx];
27106 ZeroElts[i] = true;
27110 ConstantElts[i] = true;
27111 ConstantBitData[i] = Bits;
27113 assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
27115 // Create the constant data.
27117 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
27118 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
27120 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
27122 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
27125 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
27126 DCI.AddToWorklist(CstOp.getNode());
27127 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
27131 /// \brief Fully generic combining of x86 shuffle instructions.
27133 /// This should be the last combine run over the x86 shuffle instructions. Once
27134 /// they have been fully optimized, this will recursively consider all chains
27135 /// of single-use shuffle instructions, build a generic model of the cumulative
27136 /// shuffle operation, and check for simpler instructions which implement this
27137 /// operation. We use this primarily for two purposes:
27139 /// 1) Collapse generic shuffles to specialized single instructions when
27140 /// equivalent. In most cases, this is just an encoding size win, but
27141 /// sometimes we will collapse multiple generic shuffles into a single
27142 /// special-purpose shuffle.
27143 /// 2) Look for sequences of shuffle instructions with 3 or more total
27144 /// instructions, and replace them with the slightly more expensive SSSE3
27145 /// PSHUFB instruction if available. We do this as the last combining step
27146 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
27147 /// a suitable short sequence of other instructions. The PSHUFB will either
27148 /// use a register or have to read from memory and so is slightly (but only
27149 /// slightly) more expensive than the other shuffle instructions.
27151 /// Because this is inherently a quadratic operation (for each shuffle in
27152 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
27153 /// This should never be an issue in practice as the shuffle lowering doesn't
27154 /// produce sequences of more than 8 instructions.
27156 /// FIXME: We will currently miss some cases where the redundant shuffling
27157 /// would simplify under the threshold for PSHUFB formation because of
27158 /// combine-ordering. To fix this, we should do the redundant instruction
27159 /// combining in this recursive walk.
27160 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
27161 int SrcOpIndex, SDValue Root,
27162 ArrayRef<int> RootMask,
27163 int Depth, bool HasVariableMask,
27165 TargetLowering::DAGCombinerInfo &DCI,
27166 const X86Subtarget &Subtarget) {
27167 // Bound the depth of our recursive combine because this is ultimately
27168 // quadratic in nature.
27172 // Directly rip through bitcasts to find the underlying operand.
27173 SDValue Op = SrcOps[SrcOpIndex];
27174 Op = peekThroughOneUseBitcasts(Op);
27176 MVT VT = Op.getSimpleValueType();
27177 if (!VT.isVector())
27178 return false; // Bail if we hit a non-vector.
27180 assert(Root.getSimpleValueType().isVector() &&
27181 "Shuffles operate on vector types!");
27182 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
27183 "Can only combine shuffles of the same vector register size.");
27185 // Extract target shuffle mask and resolve sentinels and inputs.
27186 SDValue Input0, Input1;
27187 SmallVector<int, 16> OpMask;
27188 if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
27191 // Add the inputs to the Ops list, avoiding duplicates.
27192 SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
27194 int InputIdx0 = -1, InputIdx1 = -1;
27195 for (int i = 0, e = Ops.size(); i < e; ++i) {
27196 SDValue BC = peekThroughBitcasts(Ops[i]);
27197 if (Input0 && BC == peekThroughBitcasts(Input0))
27199 if (Input1 && BC == peekThroughBitcasts(Input1))
27203 if (Input0 && InputIdx0 < 0) {
27204 InputIdx0 = SrcOpIndex;
27205 Ops[SrcOpIndex] = Input0;
27207 if (Input1 && InputIdx1 < 0) {
27208 InputIdx1 = Ops.size();
27209 Ops.push_back(Input1);
27212 assert(((RootMask.size() > OpMask.size() &&
27213 RootMask.size() % OpMask.size() == 0) ||
27214 (OpMask.size() > RootMask.size() &&
27215 OpMask.size() % RootMask.size() == 0) ||
27216 OpMask.size() == RootMask.size()) &&
27217 "The smaller number of elements must divide the larger.");
27218 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
27219 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
27220 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
27221 assert(((RootRatio == 1 && OpRatio == 1) ||
27222 (RootRatio == 1) != (OpRatio == 1)) &&
27223 "Must not have a ratio for both incoming and op masks!");
27225 SmallVector<int, 16> Mask;
27226 Mask.reserve(MaskWidth);
27228 // Merge this shuffle operation's mask into our accumulated mask. Note that
27229 // this shuffle's mask will be the first applied to the input, followed by the
27230 // root mask to get us all the way to the root value arrangement. The reason
27231 // for this order is that we are recursing up the operation chain.
27232 for (int i = 0; i < MaskWidth; ++i) {
27233 int RootIdx = i / RootRatio;
27234 if (RootMask[RootIdx] < 0) {
27235 // This is a zero or undef lane, we're done.
27236 Mask.push_back(RootMask[RootIdx]);
27240 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
27242 // Just insert the scaled root mask value if it references an input other
27243 // than the SrcOp we're currently inserting.
27244 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
27245 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
27246 Mask.push_back(RootMaskedIdx);
27250 RootMaskedIdx %= MaskWidth;
27252 int OpIdx = RootMaskedIdx / OpRatio;
27253 if (OpMask[OpIdx] < 0) {
27254 // The incoming lanes are zero or undef; it doesn't matter which ones we use.
27256 Mask.push_back(OpMask[OpIdx]);
27260 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
27261 int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
27262 OpMaskedIdx %= MaskWidth;
27264 if (OpMask[OpIdx] < (int)OpMask.size()) {
27265 assert(0 <= InputIdx0 && "Unknown target shuffle input");
27266 OpMaskedIdx += InputIdx0 * MaskWidth;
27268 assert(0 <= InputIdx1 && "Unknown target shuffle input");
27269 OpMaskedIdx += InputIdx1 * MaskWidth;
27272 Mask.push_back(OpMaskedIdx);
27275 // Handle the all undef/zero cases early.
27276 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
27277 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
27280 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
27281 // TODO - should we handle the mixed zero/undef case as well? Just returning
27282 // a zero mask will lose information on undef elements, possibly reducing
27283 // future combine possibilities.
27284 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
27285 Subtarget, DAG, SDLoc(Root)));
27289 // Remove unused shuffle source ops.
27290 SmallVector<SDValue, 8> UsedOps;
27291 for (int i = 0, e = Ops.size(); i < e; ++i) {
27292 int lo = UsedOps.size() * MaskWidth;
27293 int hi = lo + MaskWidth;
27294 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
27295 UsedOps.push_back(Ops[i]);
27298 for (int &M : Mask)
27302 assert(!UsedOps.empty() && "Shuffle with no inputs detected");
27305 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
27307 // See if we can recurse into each shuffle source op (if it's a target shuffle).
27308 for (int i = 0, e = Ops.size(); i < e; ++i)
27309 if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
27310 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
27311 HasVariableMask, DAG, DCI, Subtarget))
27314 // Attempt to constant fold all of the constant source ops.
27315 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
27319 // We can only combine unary and binary shuffle mask cases.
27320 if (Ops.size() > 2)
27323 // Minor canonicalization of the accumulated shuffle mask to make it easier
27324 // to match below. All this does is detect masks with sequential pairs of
27325 // elements, and shrink them to the half-width mask. It does this in a loop
27326 // so it will reduce the size of the mask to the minimal width mask which
27327 // performs an equivalent shuffle.
27328 SmallVector<int, 16> WidenedMask;
27329 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
27330 Mask = std::move(WidenedMask);
27333 // Canonicalization of binary shuffle masks to improve pattern matching by
27334 // commuting the inputs.
27335 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
27336 ShuffleVectorSDNode::commuteMask(Mask);
27337 std::swap(Ops[0], Ops[1]);
27340 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
27344 /// \brief Get the PSHUF-style mask from PSHUF node.
27346 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
27347 /// PSHUF-style masks that can be reused with such instructions.
27348 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
27349 MVT VT = N.getSimpleValueType();
27350 SmallVector<int, 4> Mask;
27351 SmallVector<SDValue, 2> Ops;
27354 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
27358 // If we have more than 128-bits, only the low 128-bits of shuffle mask
27359 // matter. Check that the upper masks are repeats and remove them.
27360 if (VT.getSizeInBits() > 128) {
27361 int LaneElts = 128 / VT.getScalarSizeInBits();
27363 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
27364 for (int j = 0; j < LaneElts; ++j)
27365 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
27366 "Mask doesn't repeat in high 128-bit lanes!");
27368 Mask.resize(LaneElts);
27371 switch (N.getOpcode()) {
27372 case X86ISD::PSHUFD:
27374 case X86ISD::PSHUFLW:
27377 case X86ISD::PSHUFHW:
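// PSHUFHW only permutes the high four words: drop the identity low half and
// rebase the remaining indices to 0..3.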
27378 Mask.erase(Mask.begin(), Mask.begin() + 4);
27379 for (int &M : Mask)
27383 llvm_unreachable("No valid shuffle instruction found!");
27387 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
27389 /// We walk up the chain and look for a combinable shuffle, skipping over
27390 /// shuffles that we could hoist this shuffle's transformation past without
27391 /// altering anything.
27393 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
27395 TargetLowering::DAGCombinerInfo &DCI) {
27396 assert(N.getOpcode() == X86ISD::PSHUFD &&
27397 "Called with something other than an x86 128-bit half shuffle!");
27400 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
27401 // of the shuffles in the chain so that we can form a fresh chain to replace
27403 SmallVector<SDValue, 8> Chain;
27404 SDValue V = N.getOperand(0);
27405 for (; V.hasOneUse(); V = V.getOperand(0)) {
27406 switch (V.getOpcode()) {
27408 return SDValue(); // Nothing combined!
27411 // Skip bitcasts as we always know the type for the target specific
27415 case X86ISD::PSHUFD:
27416 // Found another dword shuffle.
27419 case X86ISD::PSHUFLW:
27420 // Check that the low words (being shuffled) are the identity in the
27421 // dword shuffle, and the high words are self-contained.
27422 if (Mask[0] != 0 || Mask[1] != 1 ||
27423 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
27426 Chain.push_back(V);
27429 case X86ISD::PSHUFHW:
27430 // Check that the high words (being shuffled) are the identity in the
27431 // dword shuffle, and the low words are self-contained.
27432 if (Mask[2] != 2 || Mask[3] != 3 ||
27433 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
27436 Chain.push_back(V);
27439 case X86ISD::UNPCKL:
27440 case X86ISD::UNPCKH:
27441 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
27442 // shuffle into a preceding word shuffle.
27443 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
27444 V.getSimpleValueType().getVectorElementType() != MVT::i16)
27447 // Search for a half-shuffle which we can combine with.
27448 unsigned CombineOp =
27449 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
27450 if (V.getOperand(0) != V.getOperand(1) ||
27451 !V->isOnlyUserOf(V.getOperand(0).getNode()))
27453 Chain.push_back(V);
27454 V = V.getOperand(0);
27456 switch (V.getOpcode()) {
27458 return SDValue(); // Nothing to combine.
27460 case X86ISD::PSHUFLW:
27461 case X86ISD::PSHUFHW:
27462 if (V.getOpcode() == CombineOp)
27465 Chain.push_back(V);
27469 V = V.getOperand(0);
27473 } while (V.hasOneUse());
27476 // Break out of the loop if we break out of the switch.
27480 if (!V.hasOneUse())
27481 // We fell out of the loop without finding a viable combining instruction.
27484 // Merge this node's mask and our incoming mask.
27485 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
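// Compose the two shuffles by routing each element of our incoming mask
// through the mask of the shuffle we found, then rebuild V with the merged
// 4-element immediate.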
27486 for (int &M : Mask)
27488 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
27489 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
27491 // Rebuild the chain around this new shuffle.
27492 while (!Chain.empty()) {
27493 SDValue W = Chain.pop_back_val();
27495 if (V.getValueType() != W.getOperand(0).getValueType())
27496 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
27498 switch (W.getOpcode()) {
27500 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
27502 case X86ISD::UNPCKL:
27503 case X86ISD::UNPCKH:
27504 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
27507 case X86ISD::PSHUFD:
27508 case X86ISD::PSHUFLW:
27509 case X86ISD::PSHUFHW:
27510 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
27514 if (V.getValueType() != N.getValueType())
27515 V = DAG.getBitcast(N.getValueType(), V);
27517 // Return the new chain to replace N.
27521 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
27524 /// We walk up the chain, skipping shuffles of the other half and looking
27525 /// through shuffles which switch halves, trying to find a shuffle of the same
27526 /// pair of dwords.
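/// For illustration (an assumed chain, not the only shape handled):
///   (pshuflw (pshufhw (pshuflw X, m1), h), m2)
/// folds the two pshuflw masks into the inner node, since the intervening
/// pshufhw never touches the low words, and the outer pshuflw is combined away.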
27527 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
27529 TargetLowering::DAGCombinerInfo &DCI) {
27531 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
27532 "Called with something other than an x86 128-bit half shuffle!");
27534 unsigned CombineOpcode = N.getOpcode();
27536 // Walk up a single-use chain looking for a combinable shuffle.
27537 SDValue V = N.getOperand(0);
27538 for (; V.hasOneUse(); V = V.getOperand(0)) {
27539 switch (V.getOpcode()) {
27541 return false; // Nothing combined!
27544 // Skip bitcasts as we always know the type for the target specific instructions.
27548 case X86ISD::PSHUFLW:
27549 case X86ISD::PSHUFHW:
27550 if (V.getOpcode() == CombineOpcode)
27553 // Other-half shuffles are no-ops.
27556 // Break out of the loop if we break out of the switch.
27560 if (!V.hasOneUse())
27561 // We fell out of the loop without finding a viable combining instruction.
27564 // Combine away the bottom node as its shuffle will be accumulated into
27565 // a preceding shuffle.
27566 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
27568 // Record the old value.
27571 // Merge this node's mask and our incoming mask (adjusted to account for all
27572 // the pshufd instructions encountered).
27573 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
27574 for (int &M : Mask)
27576 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
27577 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
27579 // Check that the shuffles didn't cancel each other out. If not, we need to
27580 // combine to the new one.
27582 // Replace the combinable shuffle with the combined one, updating all users
27583 // so that we re-evaluate the chain here.
27584 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
27589 /// \brief Try to combine x86 target specific shuffles.
27590 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
27591 TargetLowering::DAGCombinerInfo &DCI,
27592 const X86Subtarget &Subtarget) {
27594 MVT VT = N.getSimpleValueType();
27595 SmallVector<int, 4> Mask;
27597 unsigned Opcode = N.getOpcode();
27599 case X86ISD::PSHUFD:
27600 case X86ISD::PSHUFLW:
27601 case X86ISD::PSHUFHW:
27602 Mask = getPSHUFShuffleMask(N);
27603 assert(Mask.size() == 4);
27605 case X86ISD::UNPCKL: {
27606 auto Op0 = N.getOperand(0);
27607 auto Op1 = N.getOperand(1);
27608 unsigned Opcode0 = Op0.getOpcode();
27609 unsigned Opcode1 = Op1.getOpcode();
27611 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
27612 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
27613 // TODO: Add other horizontal operations as required.
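// For illustration: (v2f64 (unpckl (fhadd A, A), (fhadd B, B))) takes lane 0
// of each horizontal sum, which is exactly (v2f64 (fhadd A, B)).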
27614 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
27615 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
27617 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
27618 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
27619 // moves upper half elements into the lower half part. For example:
27621 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1, undef:v16i8
27623 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
27625 // will be combined to:
27627 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
27629 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
27630 // happen due to advanced instructions.
27631 if (!VT.is128BitVector())
27634 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
27635 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
27637 unsigned NumElts = VT.getVectorNumElements();
27638 SmallVector<int, 8> ExpectedMask(NumElts, -1);
27639 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
27642 auto ShufOp = Op1.getOperand(0);
27643 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
27644 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
27648 case X86ISD::BLENDI: {
27649 SDValue V0 = N->getOperand(0);
27650 SDValue V1 = N->getOperand(1);
27651 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
27652 "Unexpected input vector types");
27654 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
27655 // operands and changing the mask to 1. This saves us a bunch of
27656 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
27657 // x86InstrInfo knows how to commute this back after instruction selection
27658 // if it would help register allocation.
27660 // TODO: If optimizing for size or a processor that doesn't suffer from
27661 // partial register update stalls, this should be transformed into a MOVSD
27662 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
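// For illustration: (v2f64 (blendi A, B, 2)) selects A[0] and B[1], which is
// the same result as (v2f64 (blendi B, A, 1)) with the operands commuted.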
27664 if (VT == MVT::v2f64)
27665 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
27666 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
27667 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
27668 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
27673 case X86ISD::MOVSD:
27674 case X86ISD::MOVSS: {
27675 bool isFloat = VT.isFloatingPoint();
27676 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
27677 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
27678 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
27679 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
27680 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
27681 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
27682 assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
27684 // We often lower to MOVSD/MOVSS from integer as well as native float
27685 // types; remove unnecessary domain-crossing bitcasts if we can to make it
27686 // easier to combine shuffles later on. We've already accounted for the
27687 // domain switching cost when we decided to lower with it.
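// For illustration: (v2f64 (movsd (bitcast v2i64 X), (bitcast v2i64 Y)))
// becomes (bitcast v2f64 (movsd X, Y)), keeping the node in the integer
// domain where its inputs already live.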
27688 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
27689 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
27690 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
27691 V0 = DAG.getBitcast(NewVT, V0);
27692 V1 = DAG.getBitcast(NewVT, V1);
27693 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
27698 case X86ISD::INSERTPS: {
27699 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
27700 SDValue Op0 = N.getOperand(0);
27701 SDValue Op1 = N.getOperand(1);
27702 SDValue Op2 = N.getOperand(2);
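// The INSERTPS immediate decoded below encodes, from high bits to low:
// bits [7:6] the source element index in Op1, bits [5:4] the destination
// lane in the result, and bits [3:0] a mask of result lanes forced to zero.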
27703 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
27704 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
27705 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
27706 unsigned ZeroMask = InsertPSMask & 0xF;
27708 // If we zero out all elements from Op0 then we don't need to reference it.
27709 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
27710 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
27711 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27713 // If we zero out the element from Op1 then we don't need to reference it.
27714 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
27715 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
27716 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27718 // Attempt to merge insertps Op1 with an inner target shuffle node.
27719 SmallVector<int, 8> TargetMask1;
27720 SmallVector<SDValue, 2> Ops1;
27721 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
27722 int M = TargetMask1[SrcIdx];
27723 if (isUndefOrZero(M)) {
27724 // Zero/UNDEF insertion - zero out element and remove dependency.
27725 InsertPSMask |= (1u << DstIdx);
27726 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
27727 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27729 // Update insertps mask srcidx and reference the source input directly.
27730 assert(0 <= M && M < 8 && "Shuffle index out of range");
27731 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
27732 Op1 = Ops1[M < 4 ? 0 : 1];
27733 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
27734 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27737 // Attempt to merge insertps Op0 with an inner target shuffle node.
27738 SmallVector<int, 8> TargetMask0;
27739 SmallVector<SDValue, 2> Ops0;
27740 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
27743 bool Updated = false;
27744 bool UseInput00 = false;
27745 bool UseInput01 = false;
27746 for (int i = 0; i != 4; ++i) {
27747 int M = TargetMask0[i];
27748 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
27749 // No change if element is already zero or the inserted element.
27751 } else if (isUndefOrZero(M)) {
27752 // If the target mask is undef/zero then we must zero the element.
27753 InsertPSMask |= (1u << i);
27758 // The input vector element must stay in place (lane i of either shuffle input).
27759 if (M != i && M != (i + 4))
27762 // Determine which inputs of the target shuffle we're using.
27763 UseInput00 |= (0 <= M && M < 4);
27764 UseInput01 |= (4 <= M);
27767 // If we're not using both inputs of the target shuffle then use the
27768 // referenced input directly.
27769 if (UseInput00 && !UseInput01) {
27772 } else if (!UseInput00 && UseInput01) {
27778 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
27779 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27787 // Nuke no-op shuffles that show up after combining.
27788 if (isNoopShuffleMask(Mask))
27789 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
27791 // Look for simplifications involving one or two shuffle instructions.
27792 SDValue V = N.getOperand(0);
27793 switch (N.getOpcode()) {
27796 case X86ISD::PSHUFLW:
27797 case X86ISD::PSHUFHW:
27798 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
27800 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
27801 return SDValue(); // We combined away this shuffle, so we're done.
27803 // See if this reduces to a PSHUFD which is no more expensive and can
27804 // combine with more operations. Note that it has to at least flip the
27805 // dwords as otherwise it would have been removed as a no-op.
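// For illustration: (pshuflw X, <2,3,0,1>) swaps the two low dwords at word
// granularity, so it is re-expressed as (pshufd <1,0,2,3>) of the
// v4i32-bitcast input.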
27806 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
27807 int DMask[] = {0, 1, 2, 3};
27808 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
27809 DMask[DOffset + 0] = DOffset + 1;
27810 DMask[DOffset + 1] = DOffset + 0;
27811 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
27812 V = DAG.getBitcast(DVT, V);
27813 DCI.AddToWorklist(V.getNode());
27814 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
27815 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
27816 DCI.AddToWorklist(V.getNode());
27817 return DAG.getBitcast(VT, V);
27820 // Look for shuffle patterns which can be implemented as a single unpack.
27821 // FIXME: This doesn't handle the location of the PSHUFD generically, and
27822 // only works when we have a PSHUFD followed by two half-shuffles.
27823 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
27824 (V.getOpcode() == X86ISD::PSHUFLW ||
27825 V.getOpcode() == X86ISD::PSHUFHW) &&
27826 V.getOpcode() != N.getOpcode() &&
27828 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
27829 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
27830 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
27831 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
27832 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
27833 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
27835 for (int i = 0; i < 4; ++i) {
27836 WordMask[i + NOffset] = Mask[i] + NOffset;
27837 WordMask[i + VOffset] = VMask[i] + VOffset;
27839 // Map the word mask through the DWord mask.
27841 for (int i = 0; i < 8; ++i)
27842 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
27843 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
27844 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
27845 // We can replace all three shuffles with an unpack.
27846 V = DAG.getBitcast(VT, D.getOperand(0));
27847 DCI.AddToWorklist(V.getNode());
27848 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, VT, V, V);
27857 case X86ISD::PSHUFD:
27858 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
27867 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
27868 /// operation. If true is returned then the operands of the ADDSUB operation
27869 /// are written to the parameters \p Opnd0 and \p Opnd1.
27871 /// We combine shuffles to ADDSUB directly on the abstract vector shuffle nodes
27872 /// so they are easier to match generically. We also insert dummy vector shuffle
27873 /// nodes for the operands which explicitly discard the lanes which are unused
27874 /// by this operation, so that the fact that they're unused flows through the
27875 /// rest of the combiner.
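/// For illustration: on v4f32, (vector_shuffle<0,5,2,7> (fsub A, B),
/// (fadd A, B)) takes the subtraction in even lanes and the addition in odd
/// lanes, which is the ADDSUBPS pattern; Opnd0 and Opnd1 come back as A and B.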
27876 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
27877 SDValue &Opnd0, SDValue &Opnd1) {
27879 EVT VT = N->getValueType(0);
27880 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
27881 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
27882 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
27885 // We only handle target-independent shuffles.
27886 // FIXME: It would be easy and harmless to use the target shuffle mask
27887 // extraction tool to support more.
27888 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
27891 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
27892 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
27894 SDValue V1 = N->getOperand(0);
27895 SDValue V2 = N->getOperand(1);
27897 // We require the first shuffle operand to be the FSUB node, and the second to
27898 // be the FADD node.
27899 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
27900 ShuffleVectorSDNode::commuteMask(Mask);
27902 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
27905 // If there are other uses of these operations we can't fold them.
27906 if (!V1->hasOneUse() || !V2->hasOneUse())
27909 // Ensure that both operations have the same operands. Note that we can
27910 // commute the FADD operands.
27911 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
27912 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
27913 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
27916 // We're looking for blends between FADD and FSUB nodes. We insist on these
27917 // nodes being lined up in a specific expected pattern.
27918 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
27919 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
27920 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
27921 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
27922 8, 25, 10, 27, 12, 29, 14, 31})))
27930 /// \brief Try to combine a shuffle into a target-specific add-sub or
27931 /// mul-add-sub node.
27932 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
27933 const X86Subtarget &Subtarget,
27934 SelectionDAG &DAG) {
27935 SDValue Opnd0, Opnd1;
27936 if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
27939 EVT VT = N->getValueType(0);
27942 // Try to generate X86ISD::FMADDSUB node here.
27944 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
27945 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
27947 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
27948 // the ADDSUB idiom has been successfully recognized. There are no known
27949 // X86 targets with 512-bit ADDSUB instructions!
27950 if (VT.is512BitVector())
27953 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
27956 // We are looking for a shuffle where both sources are concatenated with undef
27957 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
27958 // if we can express this as a single-source shuffle, that's preferable.
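// For illustration: (shuffle (concat X, undef), (concat Y, undef), <0,1,4,5>)
// on v4i64 becomes (shuffle (concat X, Y), undef, <0,1,2,3>), which AVX2 can
// lower with a single VPERMQ.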
27959 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
27960 const X86Subtarget &Subtarget) {
27961 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
27964 EVT VT = N->getValueType(0);
27966 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
27967 if (!VT.is128BitVector() && !VT.is256BitVector())
27970 if (VT.getVectorElementType() != MVT::i32 &&
27971 VT.getVectorElementType() != MVT::i64 &&
27972 VT.getVectorElementType() != MVT::f32 &&
27973 VT.getVectorElementType() != MVT::f64)
27976 SDValue N0 = N->getOperand(0);
27977 SDValue N1 = N->getOperand(1);
27979 // Check that both sources are concats with undef.
27980 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
27981 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
27982 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
27983 !N1.getOperand(1).isUndef())
27986 // Construct the new shuffle mask. Elements from the first source retain their
27987 // index, but elements from the second source no longer need to skip an undef.
27988 SmallVector<int, 8> Mask;
27989 int NumElts = VT.getVectorNumElements();
27991 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
27992 for (int Elt : SVOp->getMask())
27993 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
27996 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
27998 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
28001 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
28002 TargetLowering::DAGCombinerInfo &DCI,
28003 const X86Subtarget &Subtarget) {
28005 EVT VT = N->getValueType(0);
28007 // Don't create instructions with illegal types after legalize types has run.
28008 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28009 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
28012 // If we have legalized the vector types, look for blends of FADD and FSUB
28013 // nodes that we can fuse into an ADDSUB node.
28014 if (TLI.isTypeLegal(VT))
28015 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
28018 // During Type Legalization, when promoting illegal vector types,
28019 // the backend might introduce new shuffle dag nodes and bitcasts.
28021 // This code performs the following transformation:
28022 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
28023 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
28025 // We do this only if both the bitcast and the BINOP dag nodes have
28026 // one use. Also, perform this transformation only if the new binary
28027 // operation is legal. This is to avoid introducing dag nodes that
28028 // potentially need to be further expanded (or custom lowered) into a
28029 // less optimal sequence of dag nodes.
28030 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
28031 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
28032 N->getOperand(0).getOpcode() == ISD::BITCAST &&
28033 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
28034 SDValue N0 = N->getOperand(0);
28035 SDValue N1 = N->getOperand(1);
28037 SDValue BC0 = N0.getOperand(0);
28038 EVT SVT = BC0.getValueType();
28039 unsigned Opcode = BC0.getOpcode();
28040 unsigned NumElts = VT.getVectorNumElements();
28042 if (BC0.hasOneUse() && SVT.isVector() &&
28043 SVT.getVectorNumElements() * 2 == NumElts &&
28044 TLI.isOperationLegal(Opcode, VT)) {
28045 bool CanFold = false;
28051 // isOperationLegal lies for integer ops on floating point types.
28052 CanFold = VT.isInteger();
28057 // isOperationLegal lies for floating point ops on integer types.
28058 CanFold = VT.isFloatingPoint();
28062 unsigned SVTNumElts = SVT.getVectorNumElements();
28063 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28064 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
28065 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
28066 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
28067 CanFold = SVOp->getMaskElt(i) < 0;
28070 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
28071 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
28072 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
28073 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
28078 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
28079 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
28080 // consecutive, non-overlapping, and in the right order.
28081 SmallVector<SDValue, 16> Elts;
28082 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
28083 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
28085 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
28088 // For AVX2, we sometimes want to combine
28089 // (vector_shuffle <mask> (concat_vectors t1, undef)
28090 // (concat_vectors t2, undef))
28092 // into (vector_shuffle <mask> (concat_vectors t1, t2), undef)
28093 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
28094 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
28097 if (isTargetShuffle(N->getOpcode())) {
28099 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
28102 // Try recursively combining arbitrary sequences of x86 shuffle
28103 // instructions into higher-order shuffles. We do this after combining
28104 // specific PSHUF instruction sequences into their minimal form so that we
28105 // can evaluate how many specialized shuffle instructions are involved in
28106 // a particular chain.
28107 SmallVector<int, 1> NonceMask; // Just a placeholder.
28108 NonceMask.push_back(0);
28109 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
28110 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
28112 return SDValue(); // This routine will use CombineTo to replace N.
28118 /// Check if a vector extract from a target-specific shuffle of a load can be
28119 /// folded into a single element load.
28120 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
28121 /// shuffles have been custom lowered so we need to handle those here.
28122 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
28123 TargetLowering::DAGCombinerInfo &DCI) {
28124 if (DCI.isBeforeLegalizeOps())
28127 SDValue InVec = N->getOperand(0);
28128 SDValue EltNo = N->getOperand(1);
28129 EVT EltVT = N->getValueType(0);
28131 if (!isa<ConstantSDNode>(EltNo))
28134 EVT OriginalVT = InVec.getValueType();
28136 if (InVec.getOpcode() == ISD::BITCAST) {
28137 // Don't duplicate a load with other uses.
28138 if (!InVec.hasOneUse())
28140 EVT BCVT = InVec.getOperand(0).getValueType();
28141 if (!BCVT.isVector() ||
28142 BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
28144 InVec = InVec.getOperand(0);
28147 EVT CurrentVT = InVec.getValueType();
28149 if (!isTargetShuffle(InVec.getOpcode()))
28152 // Don't duplicate a load with other uses.
28153 if (!InVec.hasOneUse())
28156 SmallVector<int, 16> ShuffleMask;
28157 SmallVector<SDValue, 2> ShuffleOps;
28159 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
28160 ShuffleOps, ShuffleMask, UnaryShuffle))
28163 // Select the input vector, guarding against out of range extract vector.
28164 unsigned NumElems = CurrentVT.getVectorNumElements();
28165 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
28166 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
28168 if (Idx == SM_SentinelZero)
28169 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
28170 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
28171 if (Idx == SM_SentinelUndef)
28172 return DAG.getUNDEF(EltVT);
28174 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
28175 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1];
28178 // If inputs to shuffle are the same for both ops, then allow 2 uses
28179 unsigned AllowedUses =
28180 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
28182 if (LdNode.getOpcode() == ISD::BITCAST) {
28183 // Don't duplicate a load with other uses.
28184 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
28187 AllowedUses = 1; // only allow 1 load use if we have a bitcast
28188 LdNode = LdNode.getOperand(0);
28191 if (!ISD::isNormalLoad(LdNode.getNode()))
28194 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
28196 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
28199 // If there's a bitcast before the shuffle, check if the load type and
28200 // alignment is valid.
28201 unsigned Align = LN0->getAlignment();
28202 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28203 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
28204 EltVT.getTypeForEVT(*DAG.getContext()));
28206 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
28209 // All checks match so transform back to vector_shuffle so that DAG combiner
28210 // can finish the job
28213 // Create shuffle node taking into account the case that it's a unary shuffle
28214 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
28215 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
28217 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
28218 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
28222 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
28223 const X86Subtarget &Subtarget) {
28224 SDValue N0 = N->getOperand(0);
28225 EVT VT = N->getValueType(0);
28227 // Detect bitcasts from i32 into the x86mmx low word. Since MMX types are
28228 // special and don't usually play with other vector types, it's better to
28229 // handle them early to be sure we emit efficient code by avoiding
28230 // store-load conversions.
28231 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
28232 N0.getValueType() == MVT::v2i32 &&
28233 isNullConstant(N0.getOperand(1))) {
28234 SDValue N00 = N0->getOperand(0);
28235 if (N00.getValueType() == MVT::i32)
28236 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
28239 // Convert a bitcasted integer logic operation that has one bitcasted
28240 // floating-point operand into a floating-point logic operation. This may
28241 // create a load of a constant, but that is cheaper than materializing the
28242 // constant in an integer register and transferring it to an SSE register or
28243 // transferring the SSE operand to integer register and back.
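// For illustration: (f32 (bitcast (and (i32 (bitcast f32 X)), Y))) becomes
// (FAND X, (f32 (bitcast Y))), so the value never leaves the SSE register
// domain.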
28245 switch (N0.getOpcode()) {
28246 case ISD::AND: FPOpcode = X86ISD::FAND; break;
28247 case ISD::OR: FPOpcode = X86ISD::FOR; break;
28248 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
28249 default: return SDValue();
28252 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
28253 (Subtarget.hasSSE2() && VT == MVT::f64)))
28256 SDValue LogicOp0 = N0.getOperand(0);
28257 SDValue LogicOp1 = N0.getOperand(1);
28260 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
28261 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
28262 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
28263 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
28264 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
28265 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
28267 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
28268 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
28269 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
28270 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
28271 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
28272 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
28278 // Match a binop + shuffle pyramid that represents a horizontal reduction over
28279 // the elements of a vector.
28280 // Returns the vector that is being reduced on, or SDValue() if a reduction
28281 // was not matched.
28282 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
28283 // The pattern must end in an extract from index 0.
28284 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
28285 !isNullConstant(Extract->getOperand(1)))
28289 Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
28291 SDValue Op = Extract->getOperand(0);
28292 // At each stage, we're looking for something that looks like:
28293 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
28294 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
28295 // i32 undef, i32 undef, i32 undef, i32 undef>
28296 // %a = binop <8 x i32> %op, %s
28297 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
28298 // we expect something like:
28299 // <4,5,6,7,u,u,u,u>
28300 // <2,3,u,u,u,u,u,u>
28301 // <1,u,u,u,u,u,u,u>
28302 for (unsigned i = 0; i < Stages; ++i) {
28303 if (Op.getOpcode() != BinOp)
28306 ShuffleVectorSDNode *Shuffle =
28307 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
28309 Op = Op.getOperand(1);
28311 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
28312 Op = Op.getOperand(0);
28315 // The first operand of the shuffle should be the same as the other operand of the binop.
28317 if (!Shuffle || (Shuffle->getOperand(0) != Op))
28320 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
28321 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
28322 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
28329 // Given a select, detect the following pattern:
28330 // 1: %2 = zext <N x i8> %0 to <N x i32>
28331 // 2: %3 = zext <N x i8> %1 to <N x i32>
28332 // 3: %4 = sub nsw <N x i32> %2, %3
28333 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
28334 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
28335 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
28336 // This is useful as it is the input into a SAD pattern.
28337 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
28339 // Check the condition of the select instruction is greater-than.
28340 SDValue SetCC = Select->getOperand(0);
28341 if (SetCC.getOpcode() != ISD::SETCC)
28343 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
28344 if (CC != ISD::SETGT)
28347 SDValue SelectOp1 = Select->getOperand(1);
28348 SDValue SelectOp2 = Select->getOperand(2);
28350 // The second operand of the select should be the negation of the first
28351 // operand, which is implemented as 0 - SelectOp1.
28352 if (!(SelectOp2.getOpcode() == ISD::SUB &&
28353 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
28354 SelectOp2.getOperand(1) == SelectOp1))
28357 // The first operand of SetCC is the first operand of the select, which is the
28358 // difference between the two input vectors.
28359 if (SetCC.getOperand(0) != SelectOp1)
28362 // The second operand of the comparison can be either -1 or 0.
28363 if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
28364 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
28367 // The first operand of the select is the difference between the two input vectors.
28369 if (SelectOp1.getOpcode() != ISD::SUB)
28372 Op0 = SelectOp1.getOperand(0);
28373 Op1 = SelectOp1.getOperand(1);
28375 // Check if the operands of the sub are zero-extended from vectors of i8.
28376 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
28377 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
28378 Op1.getOpcode() != ISD::ZERO_EXTEND ||
28379 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
28385 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
28387 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
28388 const SDValue &Zext1, const SDLoc &DL) {
28390 // Find the appropriate width for the PSADBW.
28391 EVT InVT = Zext0.getOperand(0).getValueType();
28392 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
28394 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
28395 // fill in the missing vector elements with 0.
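// For illustration: a v4i8 source is concatenated with three zero v4i8
// vectors to form the v16i8 operand PSADBW expects; the extra zero lanes
// contribute nothing to the sum of absolute differences.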
28396 unsigned NumConcat = RegSize / InVT.getSizeInBits();
28397 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
28398 Ops[0] = Zext0.getOperand(0);
28399 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
28400 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
28401 Ops[0] = Zext1.getOperand(0);
28402 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
28404 // Actually build the SAD
28405 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
28406 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
28409 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
28410 const X86Subtarget &Subtarget) {
28411 // PSADBW is only supported on SSE2 and up.
28412 if (!Subtarget.hasSSE2())
28415 // Verify the type we're extracting from is appropriate
28416 // TODO: There's nothing special about i32, any integer type above i16 should
28417 // work just as well.
28418 EVT VT = Extract->getOperand(0).getValueType();
28419 if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
28422 unsigned RegSize = 128;
28423 if (Subtarget.hasBWI())
28425 else if (Subtarget.hasAVX2())
28428 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
28429 // TODO: We should be able to handle larger vectors by splitting them before
28430 // feeding them into several SADs, and then reducing over those.
28431 if (VT.getSizeInBits() / 4 > RegSize)
28434 // Match shuffle + add pyramid.
28435 SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
28437 // If there was a match, we want Root to be a select that is the root of an
28438 // abs-diff pattern.
28439 if (!Root || (Root.getOpcode() != ISD::VSELECT))
28442 // Check whether we have an abs-diff pattern feeding into the select.
28443 SDValue Zext0, Zext1;
28444 if (!detectZextAbsDiff(Root, Zext0, Zext1))
28447 // Create the SAD instruction
28449 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
28451 // If the original vector was wider than 8 elements, sum over the results
28452 // in the SAD vector.
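// For illustration: a v16i32 reduction on SSE2 produces a v2i64 PSADBW
// result; a single shuffle moves the upper i64 down and one more add folds
// it into element 0 before the final extract below.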
28453 unsigned Stages = Log2_32(VT.getVectorNumElements());
28454 MVT SadVT = SAD.getSimpleValueType();
28456 unsigned SadElems = SadVT.getVectorNumElements();
28458 for(unsigned i = Stages - 3; i > 0; --i) {
28459 SmallVector<int, 16> Mask(SadElems, -1);
28460 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
28461 Mask[j] = MaskEnd + j;
28464 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
28465 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
28469 // Return the lowest i32.
28470 MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
28471 SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
28472 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
28473 Extract->getOperand(1));
28476 /// Detect vector gather/scatter index generation and convert it from being a
28477 /// bunch of shuffles and extracts into a somewhat faster sequence.
28478 /// For i686, the best sequence is apparently storing the value and loading
28479 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
28480 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
28481 TargetLowering::DAGCombinerInfo &DCI,
28482 const X86Subtarget &Subtarget) {
28483 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
28486 SDValue InputVector = N->getOperand(0);
28487 SDLoc dl(InputVector);
28488 // Detect mmx to i32 conversion through a v2i32 elt extract.
28489 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
28490 N->getValueType(0) == MVT::i32 &&
28491 InputVector.getValueType() == MVT::v2i32 &&
28492 isa<ConstantSDNode>(N->getOperand(1)) &&
28493 N->getConstantOperandVal(1) == 0) {
28494 SDValue MMXSrc = InputVector.getOperand(0);
28496 // The bitcast source is a direct mmx result.
28497 if (MMXSrc.getValueType() == MVT::x86mmx)
28498 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
28501 EVT VT = N->getValueType(0);
28503 if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
28504 InputVector.getOpcode() == ISD::BITCAST &&
28505 isa<ConstantSDNode>(InputVector.getOperand(0))) {
28506 uint64_t ExtractedElt =
28507 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
28508 uint64_t InputValue =
28509 cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
28510 uint64_t Res = (InputValue >> ExtractedElt) & 1;
28511 return DAG.getConstant(Res, dl, MVT::i1);
28514 // Check whether this extract is the root of a sum of absolute differences
28515 // pattern. This has to be done here because we really want it to happen
28516 // pre-legalization.
28517 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
28520 // Only operate on vectors of 4 elements, where the alternative shuffling
28521 // gets to be more expensive.
28522 if (InputVector.getValueType() != MVT::v4i32)
28525 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
28526 // single use which is a sign-extend or zero-extend, and all elements are used.
28528 SmallVector<SDNode *, 4> Uses;
28529 unsigned ExtractedElements = 0;
28530 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
28531 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
28532 if (UI.getUse().getResNo() != InputVector.getResNo())
28535 SDNode *Extract = *UI;
28536 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
28539 if (Extract->getValueType(0) != MVT::i32)
28541 if (!Extract->hasOneUse())
28543 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
28544 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
28546 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
28549 // Record which element was extracted.
28550 ExtractedElements |=
28551 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
28553 Uses.push_back(Extract);
28556 // If not all the elements were used, this may not be worthwhile.
28557 if (ExtractedElements != 15)
28560 // Ok, we've now decided to do the transformation.
28561 // If 64-bit shifts are legal, use the extract-shift sequence,
28562 // otherwise bounce the vector off the cache.
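// For illustration, with legal 64-bit shifts the four i32 lanes are
// recovered as: lane0 = trunc(BottomHalf), lane1 = trunc(sra(BottomHalf, 32)),
// lane2 = trunc(TopHalf), lane3 = trunc(sra(TopHalf, 32)).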
28563 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28566 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
28567 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
28568 auto &DL = DAG.getDataLayout();
28569 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
28570 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
28571 DAG.getConstant(0, dl, VecIdxTy));
28572 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
28573 DAG.getConstant(1, dl, VecIdxTy));
28575 SDValue ShAmt = DAG.getConstant(
28576 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
28577 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
28578 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
28579 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
28580 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
28581 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
28582 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
28584 // Store the value to a temporary stack slot.
28585 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
28586 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
28587 MachinePointerInfo());
28589 EVT ElementType = InputVector.getValueType().getVectorElementType();
28590 unsigned EltSize = ElementType.getSizeInBits() / 8;
28592 // Replace each use (extract) with a load of the appropriate element.
28593 for (unsigned i = 0; i < 4; ++i) {
28594 uint64_t Offset = EltSize * i;
28595 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28596 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
28598 SDValue ScalarAddr =
28599 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
28601 // Load the scalar.
28603 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
28607 // Replace the extracts
28608 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
28609 UE = Uses.end(); UI != UE; ++UI) {
28610 SDNode *Extract = *UI;
28612 SDValue Idx = Extract->getOperand(1);
28613 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
28614 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
28617 // The replacement was made in place; don't return anything.
28621 /// If a vector select has an operand that is -1 or 0, simplify the select to a
28622 /// bitwise logic operation.
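/// For illustration: (vselect Cond, -1, X) becomes (or Cond, X) and
/// (vselect Cond, X, 0) becomes (and Cond, X), provided the condition
/// elements are already as wide as the value elements.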
28623 static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
28624 const X86Subtarget &Subtarget) {
28625 SDValue Cond = N->getOperand(0);
28626 SDValue LHS = N->getOperand(1);
28627 SDValue RHS = N->getOperand(2);
28628 EVT VT = LHS.getValueType();
28629 EVT CondVT = Cond.getValueType();
28631 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28633 if (N->getOpcode() != ISD::VSELECT)
28636 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
28637 // Check if the first operand is all zeros. This situation only
28638 // applies to avx512.
28639 if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) {
28640 // Invert the cond to not(cond): xor(op, allones) = not(op)
28641 SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28642 DAG.getConstant(1, DL, Cond.getValueType()));
28643 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
28644 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
28646 assert(CondVT.isVector() && "Vector select expects a vector selector!");
28648 // To use the condition operand as a bitwise mask, it must have elements that
28649 // are the same size as the select elements. Ie, the condition operand must
28650 // have already been promoted from the IR select condition type <N x i1>.
28651 // Don't check if the types themselves are equal because that excludes
28652 // vector floating-point selects.
28653 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
28656 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
28657 FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
28659 // Try to invert the condition if true value is not all 1s and false value is not all 0s.
28661 if (!TValIsAllOnes && !FValIsAllZeros &&
28662 // Check if the selector will be produced by CMPP*/PCMP*.
28663 Cond.getOpcode() == ISD::SETCC &&
28664 // Check if SETCC has already been promoted.
28665 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
28667 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
28668 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
28670 if (TValIsAllZeros || FValIsAllOnes) {
28671 SDValue CC = Cond.getOperand(2);
28672 ISD::CondCode NewCC =
28673 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
28674 Cond.getOperand(0).getValueType().isInteger());
28675 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
28677 std::swap(LHS, RHS);
28678 TValIsAllOnes = FValIsAllOnes;
28679 FValIsAllZeros = TValIsAllZeros;
28683 if (!TValIsAllOnes && !FValIsAllZeros)
28687 if (TValIsAllOnes && FValIsAllZeros)
28689 else if (TValIsAllOnes)
28690 Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
28691 else if (FValIsAllZeros)
28692 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, DAG.getBitcast(CondVT, LHS));
28694 return DAG.getBitcast(VT, Ret);
28697 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
28698 SDValue Cond = N->getOperand(0);
28699 SDValue LHS = N->getOperand(1);
28700 SDValue RHS = N->getOperand(2);
28703 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
28704 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
28705 if (!TrueC || !FalseC)
28708 // Don't do this for crazy integer types.
28709 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
28712 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
28713 // so that TrueC (the true value) is larger than FalseC.
28714 bool NeedsCondInvert = false;
28715 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
28716 // Efficiently invertible.
28717 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
28718 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
28719 isa<ConstantSDNode>(Cond.getOperand(1))))) {
28720 NeedsCondInvert = true;
28721 std::swap(TrueC, FalseC);
28724 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
28725 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
28726 if (NeedsCondInvert) // Invert the condition if needed.
28727 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28728 DAG.getConstant(1, DL, Cond.getValueType()));
28730 // Zero extend the condition if needed.
28731 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
28733 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
28734 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
28735 DAG.getConstant(ShAmt, DL, MVT::i8));
28738 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
28739 if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
28740 if (NeedsCondInvert) // Invert the condition if needed.
28741 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28742 DAG.getConstant(1, DL, Cond.getValueType()));
28744 // Zero extend the condition if needed.
28745 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
28746 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
28747 SDValue(FalseC, 0));
28750 // Optimize cases that will turn into an LEA instruction. This requires
28751 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
28752 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
28753 uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
28754 if (N->getValueType(0) == MVT::i32)
28755 Diff = (unsigned)Diff;
28757 bool isFastMultiplier = false;
28759 switch ((unsigned char)Diff) {
28762 case 1: // result = add base, cond
28763 case 2: // result = lea base( , cond*2)
28764 case 3: // result = lea base(cond, cond*2)
28765 case 4: // result = lea base( , cond*4)
28766 case 5: // result = lea base(cond, cond*4)
28767 case 8: // result = lea base( , cond*8)
28768 case 9: // result = lea base(cond, cond*8)
28769 isFastMultiplier = true;
28774 if (isFastMultiplier) {
28775 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
28776 if (NeedsCondInvert) // Invert the condition if needed.
28777 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28778 DAG.getConstant(1, DL, Cond.getValueType()));
28780 // Zero extend the condition if needed.
28781 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
28782 // Scale the condition by the difference.
28784 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
28785 DAG.getConstant(Diff, DL, Cond.getValueType()));
28787 // Add the base if non-zero.
28788 if (FalseC->getAPIntValue() != 0)
28789 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
28790 SDValue(FalseC, 0));
28798 // If this is a bitcasted op that can be represented as another type, push the
28799 // bitcast to the inputs. This allows more opportunities for pattern
28800 // matching masked instructions. This is called when we know that the operation
28801 // is used as one of the inputs of a vselect.
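// For illustration: (v8i64 (bitcast (v16i32 (valign X, Y, 4)))) can be
// rewritten as (v8i64 (valign (bitcast X), (bitcast Y), 2)), because rotating
// by four 32-bit elements is the same as rotating by two 64-bit elements;
// the masked VALIGNQ pattern can then match directly.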
28802 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
28803 TargetLowering::DAGCombinerInfo &DCI) {
28804 // Make sure we have a bitcast.
28805 if (OrigOp.getOpcode() != ISD::BITCAST)
28808 SDValue Op = OrigOp.getOperand(0);
28810 // If the operation is used by anything other than the bitcast, we shouldn't
28811 // do this combine as that would replicate the operation.
28812 if (!Op.hasOneUse())
28815 MVT VT = OrigOp.getSimpleValueType();
28816 MVT EltVT = VT.getVectorElementType();
28817 SDLoc DL(Op.getNode());
28819 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
28821 Op0 = DAG.getBitcast(VT, Op0);
28822 DCI.AddToWorklist(Op0.getNode());
28823 Op1 = DAG.getBitcast(VT, Op1);
28824 DCI.AddToWorklist(Op1.getNode());
28825 DCI.CombineTo(OrigOp.getNode(),
28826 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
28830 unsigned Opcode = Op.getOpcode();
28832 case X86ISD::PALIGNR:
28833 // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
28834 if (!VT.is128BitVector())
28836 Opcode = X86ISD::VALIGN;
28838 case X86ISD::VALIGN: {
28839 if (EltVT != MVT::i32 && EltVT != MVT::i64)
28841 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
28842 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
28843 unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
28844 unsigned EltSize = EltVT.getSizeInBits();
28845 // Make sure we can represent the same shift with the new VT.
28846 if ((ShiftAmt % EltSize) != 0)
28848 Imm = ShiftAmt / EltSize;
28849 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
28850 DAG.getConstant(Imm, DL, MVT::i8));
28852 case X86ISD::SHUF128: {
28853 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
28855 // Only change element size, not type.
28856 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
28858 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
28861 case ISD::INSERT_SUBVECTOR: {
28862 unsigned EltSize = EltVT.getSizeInBits();
28863 if (EltSize != 32 && EltSize != 64)
28865 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
28866 // Only change element size, not type.
28867 if (VT.isInteger() != OpEltVT.isInteger())
28869 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
28870 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
28871 SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
28872 DCI.AddToWorklist(Op0.getNode());
28873 // Op1 needs to be bitcasted to a smaller vector with the same element type.
28874 SDValue Op1 = Op.getOperand(1);
28875 MVT Op1VT = MVT::getVectorVT(EltVT,
28876 Op1.getSimpleValueType().getSizeInBits() / EltSize);
28877 Op1 = DAG.getBitcast(Op1VT, Op1);
28878 DCI.AddToWorklist(Op1.getNode());
28879 DCI.CombineTo(OrigOp.getNode(),
28880 DAG.getNode(Opcode, DL, VT, Op0, Op1,
28881 DAG.getConstant(Imm, DL, MVT::i8)));
28884 case ISD::EXTRACT_SUBVECTOR: {
28885 unsigned EltSize = EltVT.getSizeInBits();
28886 if (EltSize != 32 && EltSize != 64)
28888 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
28889 // Only change element size, not type.
28890 if (VT.isInteger() != OpEltVT.isInteger())
28892 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
28893 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
28894 // Op0 needs to be bitcasted to a larger vector with the same element type.
28895 SDValue Op0 = Op.getOperand(0);
28896 MVT Op0VT = MVT::getVectorVT(EltVT,
28897 Op0.getSimpleValueType().getSizeInBits() / EltSize);
28898 Op0 = DAG.getBitcast(Op0VT, Op0);
28899 DCI.AddToWorklist(Op0.getNode());
28900 DCI.CombineTo(OrigOp.getNode(),
28901 DAG.getNode(Opcode, DL, VT, Op0,
28902 DAG.getConstant(Imm, DL, MVT::i8)));
28910 /// Do target-specific dag combines on SELECT and VSELECT nodes.
28911 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
28912 TargetLowering::DAGCombinerInfo &DCI,
28913 const X86Subtarget &Subtarget) {
28915 SDValue Cond = N->getOperand(0);
28916 // Get the LHS/RHS of the select.
28917 SDValue LHS = N->getOperand(1);
28918 SDValue RHS = N->getOperand(2);
28919 EVT VT = LHS.getValueType();
28920 EVT CondVT = Cond.getValueType();
28921 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28923 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
28924 // instructions match the semantics of the common C idiom x<y?x:y but not
28925 // x<=y?x:y, because of how they handle negative zero (which can be
28926 // ignored in unsafe-math mode).
28927 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
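// For illustration: (select (setcc x, y, setolt), x, y) can become
// (fmin x, y) when the NaN and signed-zero checks below allow it, and the
// reversed-arm form (select (setcc x, y, setolt), y, x) maps to fmax the
// same way.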
28928 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
28929 VT != MVT::f80 && VT != MVT::f128 &&
28930 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
28931 (Subtarget.hasSSE2() ||
28932 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
28933 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
28935 unsigned Opcode = 0;
28936 // Check for x CC y ? x : y.
28937 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
28938 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
28942 // Converting this to a min would handle NaNs incorrectly, and swapping
28943 // the operands would cause it to handle comparisons between positive
28944 // and negative zero incorrectly.
28945 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
28946 if (!DAG.getTarget().Options.UnsafeFPMath &&
28947 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
28949 std::swap(LHS, RHS);
28951 Opcode = X86ISD::FMIN;
28954 // Converting this to a min would handle comparisons between positive
28955 // and negative zero incorrectly.
28956 if (!DAG.getTarget().Options.UnsafeFPMath &&
28957 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
28959 Opcode = X86ISD::FMIN;
28962 // Converting this to a min would handle both negative zeros and NaNs
28963 // incorrectly, but we can swap the operands to fix both.
28964 std::swap(LHS, RHS);
28968 Opcode = X86ISD::FMIN;
28972 // Converting this to a max would handle comparisons between positive
28973 // and negative zero incorrectly.
28974 if (!DAG.getTarget().Options.UnsafeFPMath &&
28975 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
28977 Opcode = X86ISD::FMAX;
28980 // Converting this to a max would handle NaNs incorrectly, and swapping
28981 // the operands would cause it to handle comparisons between positive
28982 // and negative zero incorrectly.
28983 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
28984 if (!DAG.getTarget().Options.UnsafeFPMath &&
28985 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
28987 std::swap(LHS, RHS);
28989 Opcode = X86ISD::FMAX;
28992 // Converting this to a max would handle both negative zeros and NaNs
28993 // incorrectly, but we can swap the operands to fix both.
28994 std::swap(LHS, RHS);
28998 Opcode = X86ISD::FMAX;
29001 // Check for x CC y ? y : x -- a min/max with reversed arms.
29002 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
29003 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
29007 // Converting this to a min would handle comparisons between positive
29008 // and negative zero incorrectly, and swapping the operands would
29009 // cause it to handle NaNs incorrectly.
29010 if (!DAG.getTarget().Options.UnsafeFPMath &&
29011 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
29012 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
29014 std::swap(LHS, RHS);
29016 Opcode = X86ISD::FMIN;
29019 // Converting this to a min would handle NaNs incorrectly.
29020 if (!DAG.getTarget().Options.UnsafeFPMath &&
29021 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
29023 Opcode = X86ISD::FMIN;
29026 // Converting this to a min would handle both negative zeros and NaNs
29027 // incorrectly, but we can swap the operands to fix both.
29028 std::swap(LHS, RHS);
29032 Opcode = X86ISD::FMIN;
29036 // Converting this to a max would handle NaNs incorrectly.
29037 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
29039 Opcode = X86ISD::FMAX;
29042 // Converting this to a max would handle comparisons between positive
29043 // and negative zero incorrectly, and swapping the operands would
29044 // cause it to handle NaNs incorrectly.
29045 if (!DAG.getTarget().Options.UnsafeFPMath &&
29046 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
29047 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
29049 std::swap(LHS, RHS);
29051 Opcode = X86ISD::FMAX;
29054 // Converting this to a max would handle both negative zeros and NaNs
29055 // incorrectly, but we can swap the operands to fix both.
29056 std::swap(LHS, RHS);
29060 Opcode = X86ISD::FMAX;
29066 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
29069 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
29070 // lowering on KNL. In this case we convert it to
29071 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
29072 // The same applies to all 128- and 256-bit vectors of i8 and i16.
29073 // From SKX onward these selects have a proper lowering.
29074 if (Subtarget.hasAVX512() && CondVT.isVector() &&
29075 CondVT.getVectorElementType() == MVT::i1 &&
29076 (VT.is128BitVector() || VT.is256BitVector()) &&
29077 (VT.getVectorElementType() == MVT::i8 ||
29078 VT.getVectorElementType() == MVT::i16) &&
29079 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
29080 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
29081 DCI.AddToWorklist(Cond.getNode());
29082 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
29085 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
29088 // Canonicalize max and min:
29089 // (x > y) ? x : y -> (x >= y) ? x : y
29090 // (x < y) ? x : y -> (x <= y) ? x : y
29091 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
29092 // the need for an extra compare
29093 // against zero. e.g.
29094 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
29096 // testl %edi, %edi
29098 // cmovgl %edi, %eax
29102 // cmovsl %eax, %edi
29103 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
29104 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
29105 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
29106 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
29111 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
29112 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
29113 Cond.getOperand(0), Cond.getOperand(1), NewCC);
29114 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
29119 // Early exit check
29120 if (!TLI.isTypeLegal(VT))
29123 // Match VSELECTs into subs with unsigned saturation.
29124 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
29125 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
29126 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
29127 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
29128 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
29130 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
29131 // left side invert the predicate to simplify logic below.
29133 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
29135 CC = ISD::getSetCCInverse(CC, true);
29136 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
29140 if (Other.getNode() && Other->getNumOperands() == 2 &&
29141 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
29142 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
29143 SDValue CondRHS = Cond->getOperand(1);
29145 // Look for a general sub with unsigned saturation first.
29146 // x >= y ? x-y : 0 --> subus x, y
29147 // x > y ? x-y : 0 --> subus x, y
29148 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
29149 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
29150 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
29152 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
29153 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
29154 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
29155 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
29156 // If the RHS is a constant we have to reverse the const
29157 // canonicalization.
29158 // x > C-1 ? x+-C : 0 --> subus x, C
29159 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
29160 CondRHSConst->getAPIntValue() ==
29161 (-OpRHSConst->getAPIntValue() - 1))
29162 return DAG.getNode(
29163 X86ISD::SUBUS, DL, VT, OpLHS,
29164 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
29166 // Another special case: If C was a sign bit, the sub has been
29167 // canonicalized into a xor.
29168 // FIXME: Would it be better to use computeKnownBits to determine
29169 // whether it's safe to decanonicalize the xor?
29170 // x s< 0 ? x^C : 0 --> subus x, C
29171 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
29172 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
29173 OpRHSConst->getAPIntValue().isSignBit())
29174 // Note that we have to rebuild the RHS constant here to ensure we
29175 // don't rely on particular values of undef lanes.
29176 return DAG.getNode(
29177 X86ISD::SUBUS, DL, VT, OpLHS,
29178 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
29183 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, Subtarget))
29186 // If this is a *dynamic* select (non-constant condition) and we can match
29187 // this node with one of the variable blend instructions, restructure the
29188 // condition so that the blends can use the high bit of each element and use
29189 // SimplifyDemandedBits to simplify the condition operand.
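// The variable blend instructions only test the sign bit of each condition
// element, so it is enough to demand the high bit of Cond here; anything
// feeding the condition below that bit can be simplified away.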
29190 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
29191 !DCI.isBeforeLegalize() &&
29192 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
29193 unsigned BitWidth = Cond.getScalarValueSizeInBits();
29195 // Don't optimize vector selects that map to mask-registers.
29196 if (BitWidth == 1)
29197 return SDValue();
29199 // We can only handle the cases where VSELECT is directly legal on the
29200 // subtarget. We custom lower VSELECT nodes with constant conditions and
29201 // this makes it hard to see whether a dynamic VSELECT will correctly
29202 // lower, so we both check the operation's status and explicitly handle the
29203 // cases where a *dynamic* blend will fail even though a constant-condition
29204 // blend could be custom lowered.
29205 // FIXME: We should find a better way to handle this class of problems.
29206 // Potentially, we should combine constant-condition vselect nodes
29207 // pre-legalization into shuffles and not mark as many types as custom
29209 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
29211 // FIXME: We don't support i16-element blends currently. We could and
29212 // should support them by making *all* the bits in the condition be set
29213 // rather than just the high bit and using an i8-element blend.
29214 if (VT.getVectorElementType() == MVT::i16)
29216 // Dynamic blending was only available from SSE4.1 onward.
29217 if (VT.is128BitVector() && !Subtarget.hasSSE41())
29219 // Byte blends are only available in AVX2
29220 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
29223 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
29224 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
29226 APInt KnownZero, KnownOne;
29227 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
29228 DCI.isBeforeLegalizeOps());
29229 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
29230 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
29232 // If we changed the computation somewhere in the DAG, this change
29233 // will affect all users of Cond.
29234 // Make sure it is fine and update all the nodes so that we do not
29235 // use the generic VSELECT anymore. Otherwise, we may perform
29236 // wrong optimizations as we messed up with the actual expectation
29237 // for the vector boolean values.
29238 if (Cond != TLO.Old) {
29239 // Check all uses of that condition operand to check whether it will be
29240 // consumed by non-BLEND instructions, which may depend on all bits being
29241 // set correctly.
29242 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
29244 if (I->getOpcode() != ISD::VSELECT)
29245 // TODO: Add other opcodes eventually lowered into BLEND.
29248 // Update all the users of the condition, before committing the change,
29249 // so that the VSELECT optimizations that expect the correct vector
29250 // boolean value will not be triggered.
29251 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
29253 DAG.ReplaceAllUsesOfValueWith(
29255 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
29256 Cond, I->getOperand(1), I->getOperand(2)));
29257 DCI.CommitTargetLoweringOpt(TLO);
29260 // At this point, only Cond is changed. Change the condition
29261 // just for N to keep the opportunity to optimize all other
29262 // users their own way.
29263 DAG.ReplaceAllUsesOfValueWith(
29265 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
29266 TLO.New, N->getOperand(1), N->getOperand(2)));
29271 // Look for vselects with LHS/RHS being bitcasted from an operation that
29272 // can be executed on another type. Push the bitcast to the inputs of
29273 // the operation. This exposes opportunities for using masking instructions.
29274 if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
29275 CondVT.getVectorElementType() == MVT::i1) {
29276 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
29277 return SDValue(N, 0);
29278 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
29279 return SDValue(N, 0);
29286 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
29288 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
29289 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
29290 /// Note that this is only legal for some op/cc combinations.
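/// For example (illustrative): a test such as "x.fetch_add(1) < 0" normally
/// compares the returned old value against zero (COND_S); after this combine
/// the flags of the LOCKed add itself are used with COND_LE, since
/// old < 0 <=> old + 1 <= 0 once the condition codes account for overflow.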
29291 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
29292 SelectionDAG &DAG) {
29293 // This combine only operates on CMP-like nodes.
29294 if (!(Cmp.getOpcode() == X86ISD::CMP ||
29295 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
29298 // This only applies to variations of the common case:
29299 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
29300 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
29301 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
29302 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
29303 // Using the proper condcodes (see below), overflow is checked for.
29305 // FIXME: We can generalize both constraints:
29306 // - XOR/OR/AND (if they were made to survive AtomicExpand)
29308 // if the result is compared.
29310 SDValue CmpLHS = Cmp.getOperand(0);
29311 SDValue CmpRHS = Cmp.getOperand(1);
29313 if (!CmpLHS.hasOneUse())
29316 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
29317 if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
29320 const unsigned Opc = CmpLHS.getOpcode();
29322 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
29325 SDValue OpRHS = CmpLHS.getOperand(2);
29326 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
29330 APInt Addend = OpRHSC->getAPIntValue();
29331 if (Opc == ISD::ATOMIC_LOAD_SUB)
29334 if (CC == X86::COND_S && Addend == 1)
29335 CC = X86::COND_LE;
29336 else if (CC == X86::COND_NS && Addend == 1)
29337 CC = X86::COND_G;
29338 else if (CC == X86::COND_G && Addend == -1)
29339 CC = X86::COND_GE;
29340 else if (CC == X86::COND_LE && Addend == -1)
29341 CC = X86::COND_L;
29345 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
29346 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
29347 DAG.getUNDEF(CmpLHS.getValueType()));
29348 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
29352 // Check whether a boolean test is testing a boolean value generated by
29353 // X86ISD::SETCC. If so, return the operand of that SETCC and the proper condition
29354 // flag.
29356 // Simplify the following patterns:
29357 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
29358 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
29359 // to (Op EFLAGS Cond)
29361 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
29362 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
29363 // to (Op EFLAGS !Cond)
29365 // where Op could be BRCOND or CMOV.
29367 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
29368 // This combine only operates on CMP-like nodes.
29369 if (!(Cmp.getOpcode() == X86ISD::CMP ||
29370 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
29373 // Quit if not used as a boolean value.
29374 if (CC != X86::COND_E && CC != X86::COND_NE)
29377 // Check CMP operands. One of them should be 0 or 1 and the other should be
29378 // an SetCC or extended from it.
29379 SDValue Op1 = Cmp.getOperand(0);
29380 SDValue Op2 = Cmp.getOperand(1);
29383 const ConstantSDNode* C = nullptr;
29384 bool needOppositeCond = (CC == X86::COND_E);
29385 bool checkAgainstTrue = false; // Is it a comparison against 1?
29387 if ((C = dyn_cast<ConstantSDNode>(Op1)))
29389 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
29391 else // Quit if all operands are not constants.
29394 if (C->getZExtValue() == 1) {
29395 needOppositeCond = !needOppositeCond;
29396 checkAgainstTrue = true;
29397 } else if (C->getZExtValue() != 0)
29398 // Quit if the constant is neither 0 or 1.
29401 bool truncatedToBoolWithAnd = false;
29402 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
29403 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
29404 SetCC.getOpcode() == ISD::TRUNCATE ||
29405 SetCC.getOpcode() == ISD::AND) {
29406 if (SetCC.getOpcode() == ISD::AND) {
29408 if (isOneConstant(SetCC.getOperand(0)))
29410 if (isOneConstant(SetCC.getOperand(1)))
29414 SetCC = SetCC.getOperand(OpIdx);
29415 truncatedToBoolWithAnd = true;
29417 SetCC = SetCC.getOperand(0);
29420 switch (SetCC.getOpcode()) {
29421 case X86ISD::SETCC_CARRY:
29422 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
29423 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
29424 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
29425 // truncated to i1 using 'and'.
29426 if (checkAgainstTrue && !truncatedToBoolWithAnd)
29428 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
29429 "Invalid use of SETCC_CARRY!");
29431 case X86ISD::SETCC:
29432 // Set the condition code or opposite one if necessary.
29433 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
29434 if (needOppositeCond)
29435 CC = X86::GetOppositeBranchCondition(CC);
29436 return SetCC.getOperand(1);
29437 case X86ISD::CMOV: {
29438 // Check whether false/true value has canonical one, i.e. 0 or 1.
29439 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
29440 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
29441 // Quit if true value is not a constant.
29444 // Quit if false value is not a constant.
29446 SDValue Op = SetCC.getOperand(0);
29447 // Skip 'zext' or 'trunc' node.
29448 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
29449 Op.getOpcode() == ISD::TRUNCATE)
29450 Op = Op.getOperand(0);
29451 // A special case for rdrand/rdseed, where 0 is set if the false cond is
29452 // found.
29453 if ((Op.getOpcode() != X86ISD::RDRAND &&
29454 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
29457 // Quit if false value is not the constant 0 or 1.
29458 bool FValIsFalse = true;
29459 if (FVal && FVal->getZExtValue() != 0) {
29460 if (FVal->getZExtValue() != 1)
29462 // If FVal is 1, opposite cond is needed.
29463 needOppositeCond = !needOppositeCond;
29464 FValIsFalse = false;
29466 // Quit if TVal is not the constant opposite of FVal.
29467 if (FValIsFalse && TVal->getZExtValue() != 1)
29469 if (!FValIsFalse && TVal->getZExtValue() != 0)
29471 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
29472 if (needOppositeCond)
29473 CC = X86::GetOppositeBranchCondition(CC);
29474 return SetCC.getOperand(3);
29481 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
29483 /// (X86or (X86setcc) (X86setcc))
29484 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
29485 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
29486 X86::CondCode &CC1, SDValue &Flags,
29488 if (Cond->getOpcode() == X86ISD::CMP) {
29489 if (!isNullConstant(Cond->getOperand(1)))
29492 Cond = Cond->getOperand(0);
29497 SDValue SetCC0, SetCC1;
29498 switch (Cond->getOpcode()) {
29499 default: return false;
29506 SetCC0 = Cond->getOperand(0);
29507 SetCC1 = Cond->getOperand(1);
29511 // Make sure we have SETCC nodes, using the same flags value.
29512 if (SetCC0.getOpcode() != X86ISD::SETCC ||
29513 SetCC1.getOpcode() != X86ISD::SETCC ||
29514 SetCC0->getOperand(1) != SetCC1->getOperand(1))
29517 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
29518 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
29519 Flags = SetCC0->getOperand(1);
29523 /// Optimize an EFLAGS definition used according to the condition code \p CC
29524 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
29525 /// uses of chain values.
29526 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
29527 SelectionDAG &DAG) {
29528 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
29530 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
29533 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
29534 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
29535 TargetLowering::DAGCombinerInfo &DCI,
29536 const X86Subtarget &Subtarget) {
29539 // If the flag operand isn't dead, don't touch this CMOV.
29540 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
29543 SDValue FalseOp = N->getOperand(0);
29544 SDValue TrueOp = N->getOperand(1);
29545 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
29546 SDValue Cond = N->getOperand(3);
29548 if (CC == X86::COND_E || CC == X86::COND_NE) {
29549 switch (Cond.getOpcode()) {
29553 // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
29554 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
29555 return (CC == X86::COND_E) ? FalseOp : TrueOp;
29559 // Try to simplify the EFLAGS and condition code operands.
29560 // We can't always do this as FCMOV only supports a subset of X86 cond.
29561 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
29562 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
29563 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
29565 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
29569 // If this is a select between two integer constants, try to do some
29570 // optimizations. Note that the operands are ordered the opposite of SELECT
29572 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
29573 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
29574 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
29575 // larger than FalseC (the false value).
29576 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
29577 CC = X86::GetOppositeBranchCondition(CC);
29578 std::swap(TrueC, FalseC);
29579 std::swap(TrueOp, FalseOp);
29582 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
29583 // This is efficient for any integer data type (including i8/i16) and
29585 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
29586 Cond = getSETCC(CC, Cond, DL, DAG);
29588 // Zero extend the condition if needed.
29589 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
29591 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
29592 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
29593 DAG.getConstant(ShAmt, DL, MVT::i8));
29594 if (N->getNumValues() == 2) // Dead flag value?
29595 return DCI.CombineTo(N, Cond, SDValue());
29599 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
29600 // for any integer data type, including i8/i16.
29601 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
29602 Cond = getSETCC(CC, Cond, DL, DAG);
29604 // Zero extend the condition if needed.
29605 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
29606 FalseC->getValueType(0), Cond);
29607 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29608 SDValue(FalseC, 0));
29610 if (N->getNumValues() == 2) // Dead flag value?
29611 return DCI.CombineTo(N, Cond, SDValue());
29615 // Optimize cases that will turn into an LEA instruction. This requires
29616 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
29617 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
29618 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
29619 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
29621 bool isFastMultiplier = false;
29623 switch ((unsigned char)Diff) {
29625 case 1: // result = add base, cond
29626 case 2: // result = lea base( , cond*2)
29627 case 3: // result = lea base(cond, cond*2)
29628 case 4: // result = lea base( , cond*4)
29629 case 5: // result = lea base(cond, cond*4)
29630 case 8: // result = lea base( , cond*8)
29631 case 9: // result = lea base(cond, cond*8)
29632 isFastMultiplier = true;
29637 if (isFastMultiplier) {
29638 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
29639 Cond = getSETCC(CC, Cond, DL ,DAG);
29640 // Zero extend the condition if needed.
29641 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
29643 // Scale the condition by the difference.
29645 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
29646 DAG.getConstant(Diff, DL, Cond.getValueType()));
29648 // Add the base if non-zero.
29649 if (FalseC->getAPIntValue() != 0)
29650 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29651 SDValue(FalseC, 0));
29652 if (N->getNumValues() == 2) // Dead flag value?
29653 return DCI.CombineTo(N, Cond, SDValue());
29660 // Handle these cases:
29661 // (select (x != c), e, c) -> select (x != c), e, x),
29662 // (select (x == c), c, e) -> select (x == c), x, e)
29663 // where the c is an integer constant, and the "select" is the combination
29664 // of CMOV and CMP.
29666 // The rationale for this change is that the conditional-move from a constant
29667 // needs two instructions, whereas the conditional-move from a register needs
29668 // only one instruction.
29670 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
29671 // some instruction-combining opportunities. This opt needs to be
29672 // postponed as late as possible.
29674 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
29675 // the DCI.xxxx conditions are provided to postpone the optimization as
29676 // late as possible.
29678 ConstantSDNode *CmpAgainst = nullptr;
29679 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
29680 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
29681 !isa<ConstantSDNode>(Cond.getOperand(0))) {
29683 if (CC == X86::COND_NE &&
29684 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
29685 CC = X86::GetOppositeBranchCondition(CC);
29686 std::swap(TrueOp, FalseOp);
29689 if (CC == X86::COND_E &&
29690 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
29691 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
29692 DAG.getConstant(CC, DL, MVT::i8), Cond };
29693 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
29698 // Fold and/or of setcc's to double CMOV:
29699 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
29700 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
29702 // This combine lets us generate:
29703 // cmovcc1 (jcc1 if we don't have CMOV)
29709 // cmovne (jne if we don't have CMOV)
29710 // When we can't use the CMOV instruction, it might increase branch
29711 // mispredicts.
29712 // When we can use CMOV, or when there is no mispredict, this improves
29713 // throughput and reduces register pressure.
29715 if (CC == X86::COND_NE) {
29717 X86::CondCode CC0, CC1;
29719 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
29721 std::swap(FalseOp, TrueOp);
29722 CC0 = X86::GetOppositeBranchCondition(CC0);
29723 CC1 = X86::GetOppositeBranchCondition(CC1);
29726 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
29728 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
29729 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
29730 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
29731 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
29739 /// Different mul shrinking modes.
29740 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
29742 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
29743 EVT VT = N->getOperand(0).getValueType();
29744 if (VT.getScalarSizeInBits() != 32)
29747 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
29748 unsigned SignBits[2] = {1, 1};
29749 bool IsPositive[2] = {false, false};
29750 for (unsigned i = 0; i < 2; i++) {
29751 SDValue Opd = N->getOperand(i);
29753 // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
29754 // compute signbits for it separately.
29755 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
29756 // For anyextend, it is safe to assume an appropriate number of leading
29757 // sign/zero bits.
29758 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
29760 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
29765 IsPositive[i] = true;
29766 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
29767 // All the operands of BUILD_VECTOR need to be int constant.
29768 // Find the smallest value range which all the operands belong to.
29770 IsPositive[i] = true;
29771 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
29772 if (SubOp.isUndef())
29774 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
29777 APInt IntVal = CN->getAPIntValue();
29778 if (IntVal.isNegative())
29779 IsPositive[i] = false;
29780 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
29783 SignBits[i] = DAG.ComputeNumSignBits(Opd);
29784 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
29785 IsPositive[i] = true;
29789 bool AllPositive = IsPositive[0] && IsPositive[1];
29790 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
29791 // When ranges are from -128 ~ 127, use MULS8 mode.
29792 if (MinSignBits >= 25)
29794 // When ranges are from 0 ~ 255, use MULU8 mode.
29795 else if (AllPositive && MinSignBits >= 24)
29797 // When ranges are from -32768 ~ 32767, use MULS16 mode.
29798 else if (MinSignBits >= 17)
29800 // When ranges are from 0 ~ 65535, use MULU16 mode.
29801 else if (AllPositive && MinSignBits >= 16)
29808 /// When the operands of a vector mul are extended from smaller-sized values,
29809 /// like i8 and i16, the type of the mul may be shrunk to generate more
29810 /// efficient code. Two typical patterns are handled:
29812 /// %2 = sext/zext <N x i8> %1 to <N x i32>
29813 /// %4 = sext/zext <N x i8> %3 to <N x i32>
29814 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
29815 /// %5 = mul <N x i32> %2, %4
29818 /// %2 = zext/sext <N x i16> %1 to <N x i32>
29819 /// %4 = zext/sext <N x i16> %3 to <N x i32>
29820 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
29821 /// %5 = mul <N x i32> %2, %4
29823 /// There are four mul shrinking modes:
29824 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
29825 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
29826 /// generate pmullw+sext32 for it (MULS8 mode).
29827 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
29828 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
29829 /// generate pmullw+zext32 for it (MULU8 mode).
29830 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
29831 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
29832 /// generate pmullw+pmulhw for it (MULS16 mode).
29833 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
29834 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
29835 /// generate pmullw+pmulhuw for it (MULU16 mode).
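/// For example (illustrative): a <8 x i32> multiply whose operands are both
/// known to fit in 16 unsigned bits is emitted below as pmullw for the low
/// halves plus pmulhuw for the high halves, with the two results interleaved
/// to rebuild the 32-bit products (MULU16 mode).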
29836 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
29837 const X86Subtarget &Subtarget) {
29838 // Check for legality
29839 // pmullw/pmulhw are only available with SSE2.
29840 if (!Subtarget.hasSSE2())
29843 // Check for profitability
29844 // pmulld is supported since SSE41. It is better to use pmulld
29845 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
29846 // the latter.
29847 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
29848 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
29852 if (!canReduceVMulWidth(N, DAG, Mode))
29856 SDValue N0 = N->getOperand(0);
29857 SDValue N1 = N->getOperand(1);
29858 EVT VT = N->getOperand(0).getValueType();
29859 unsigned RegSize = 128;
29860 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
29862 EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
29863 // Shrink the operands of mul.
29864 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
29865 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
29867 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
29868 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
29869 // lower part is needed.
29870 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
29871 if (Mode == MULU8 || Mode == MULS8) {
29872 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
29875 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
29876 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
29877 // the higher part is also needed.
29878 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
29879 ReducedVT, NewN0, NewN1);
29881 // Repack the lower part and higher part result of mul into a wider
29883 // Generate shuffle functioning as punpcklwd.
29884 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
29885 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
29886 ShuffleMask[2 * i] = i;
29887 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
29890 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
29891 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
29892 // Generate shuffle functioning as punpckhwd.
29893 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
29894 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
29895 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
29898 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
29899 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
29900 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
29903 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
29904 // to legalize the mul explicitly because implicit legalization for type
29905 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
29906 // instructions which will not exist when we explicitly legalize it by
29907 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
29908 // <4 x i16> undef).
29910 // Legalize the operands of mul.
29911 // FIXME: We may be able to handle non-concatenated vectors by insertion.
29912 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
29913 if ((RegSize % ReducedSizeInBits) != 0)
29916 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
29917 DAG.getUNDEF(ReducedVT));
29919 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
29921 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
29923 if (Mode == MULU8 || Mode == MULS8) {
29924 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
29926 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
29928 // convert the type of mul result to VT.
29929 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
29930 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
29931 : ISD::SIGN_EXTEND_VECTOR_INREG,
29933 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
29934 DAG.getIntPtrConstant(0, DL));
29936 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
29937 // MULU16/MULS16, both parts are needed.
29938 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
29939 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
29940 OpsVT, NewN0, NewN1);
29942 // Repack the lower part and higher part result of mul into a wider
29943 // result. Make sure the type of mul result is VT.
29944 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
29945 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
29946 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
29947 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
29948 DAG.getIntPtrConstant(0, DL));
29953 /// Optimize a single multiply with constant into two operations in order to
29954 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
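/// For example (illustrative): x*40 becomes (x*5) << 3 (LEA + SHL), and
/// x*45 becomes (x*9)*5 (LEA + LEA), matching the MulAmt1/MulAmt2 split below.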
29955 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
29956 TargetLowering::DAGCombinerInfo &DCI,
29957 const X86Subtarget &Subtarget) {
29958 EVT VT = N->getValueType(0);
29959 if (DCI.isBeforeLegalize() && VT.isVector())
29960 return reduceVMULWidth(N, DAG, Subtarget);
29962 // An imul is usually smaller than the alternative sequence.
29963 if (DAG.getMachineFunction().getFunction()->optForMinSize())
29966 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
29969 if (VT != MVT::i64 && VT != MVT::i32)
29972 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
29975 uint64_t MulAmt = C->getZExtValue();
29976 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
29979 uint64_t MulAmt1 = 0;
29980 uint64_t MulAmt2 = 0;
29981 if ((MulAmt % 9) == 0) {
29983 MulAmt2 = MulAmt / 9;
29984 } else if ((MulAmt % 5) == 0) {
29986 MulAmt2 = MulAmt / 5;
29987 } else if ((MulAmt % 3) == 0) {
29989 MulAmt2 = MulAmt / 3;
29995 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
29997 if (isPowerOf2_64(MulAmt2) &&
29998 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
29999 // If the second multiplier is pow2, issue it first. We want the multiply by
30000 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
30001 // is an add.
30002 std::swap(MulAmt1, MulAmt2);
30004 if (isPowerOf2_64(MulAmt1))
30005 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
30006 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
30008 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
30009 DAG.getConstant(MulAmt1, DL, VT));
30011 if (isPowerOf2_64(MulAmt2))
30012 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
30013 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
30015 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
30016 DAG.getConstant(MulAmt2, DL, VT));
30020 assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
30021 && "Both cases that could cause potential overflows should have "
30022 "already been handled.");
30023 if (isPowerOf2_64(MulAmt - 1))
30024 // (mul x, 2^N + 1) => (add (shl x, N), x)
30025 NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
30026 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
30027 DAG.getConstant(Log2_64(MulAmt - 1), DL,
30030 else if (isPowerOf2_64(MulAmt + 1))
30031 // (mul x, 2^N - 1) => (sub (shl x, N), x)
30032 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
30034 DAG.getConstant(Log2_64(MulAmt + 1),
30035 DL, MVT::i8)), N->getOperand(0));
30039 // Do not add new nodes to DAG combiner worklist.
30040 DCI.CombineTo(N, NewMul, false);
30045 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
30046 SDValue N0 = N->getOperand(0);
30047 SDValue N1 = N->getOperand(1);
30048 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
30049 EVT VT = N0.getValueType();
30051 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
30052 // since the result of setcc_c is all zero's or all ones.
30053 if (VT.isInteger() && !VT.isVector() &&
30054 N1C && N0.getOpcode() == ISD::AND &&
30055 N0.getOperand(1).getOpcode() == ISD::Constant) {
30056 SDValue N00 = N0.getOperand(0);
30057 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
30058 const APInt &ShAmt = N1C->getAPIntValue();
30059 Mask = Mask.shl(ShAmt);
30060 bool MaskOK = false;
30061 // We can handle cases concerning bit-widening nodes containing setcc_c if
30062 // we carefully interrogate the mask to make sure we are semantics
30064 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
30065 // of the underlying setcc_c operation if the setcc_c was zero extended.
30066 // Consider the following example:
30067 // zext(setcc_c) -> i32 0x0000FFFF
30068 // c1 -> i32 0x0000FFFF
30069 // c2 -> i32 0x00000001
30070 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
30071 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
30072 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30074 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
30075 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
30077 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
30078 N00.getOpcode() == ISD::ANY_EXTEND) &&
30079 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
30080 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
30082 if (MaskOK && Mask != 0) {
30084 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
30088 // Hardware support for vector shifts is sparse which makes us scalarize the
30089 // vector operations in many cases. Also, on sandybridge ADD is faster than
30090 // SHL.
30091 // (shl V, 1) -> add V,V
30092 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
30093 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
30094 assert(N0.getValueType().isVector() && "Invalid vector shift type");
30095 // We shift all of the values by one. In many cases we do not have
30096 // hardware support for this operation. This is better expressed as an ADD
30098 if (N1SplatC->getAPIntValue() == 1)
30099 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
30105 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
30106 SDValue N0 = N->getOperand(0);
30107 SDValue N1 = N->getOperand(1);
30108 EVT VT = N0.getValueType();
30109 unsigned Size = VT.getSizeInBits();
30111 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
30112 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
30113 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
30114 // depending on sign of (SarConst - [56,48,32,24,16])
30116 // sexts in X86 are MOVs. The MOVs have the same code size
30117 // as the above SHIFTs (only a shift by 1 has smaller code size).
30118 // However the MOVs have 2 advantages to a SHIFT:
30119 // 1. MOVs can write to a register that differs from source
30120 // 2. MOVs accept memory operands
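// For example (illustrative, i32): (sra (shl x, 24), 27) becomes
// (sra (sext_inreg x, i8), 3), i.e. a movsx followed by a shift by 3.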
30122 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
30123 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
30124 N0.getOperand(1).getOpcode() != ISD::Constant)
30127 SDValue N00 = N0.getOperand(0);
30128 SDValue N01 = N0.getOperand(1);
30129 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
30130 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
30131 EVT CVT = N1.getValueType();
30133 if (SarConst.isNegative())
30136 for (MVT SVT : MVT::integer_valuetypes()) {
30137 unsigned ShiftSize = SVT.getSizeInBits();
30138 // Skip types without a corresponding sext/zext and ShlConst values
30139 // that are not one of [56,48,32,24,16].
30140 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
30144 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
30145 SarConst = SarConst - (Size - ShiftSize);
30148 else if (SarConst.isNegative())
30149 return DAG.getNode(ISD::SHL, DL, VT, NN,
30150 DAG.getConstant(-SarConst, DL, CVT));
30152 return DAG.getNode(ISD::SRA, DL, VT, NN,
30153 DAG.getConstant(SarConst, DL, CVT));
30158 /// \brief Returns a vector of 0s if the node in input is a vector logical
30159 /// shift by a constant amount which is known to be bigger than or equal
30160 /// to the vector element size in bits.
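/// For example (illustrative): a v4i32 logical shift by a splat amount of 32
/// or more folds to the all-zeros vector, since the hardware shifts clear
/// every lane for such counts.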
30161 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
30162 const X86Subtarget &Subtarget) {
30163 EVT VT = N->getValueType(0);
30165 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
30166 (!Subtarget.hasInt256() ||
30167 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
30170 SDValue Amt = N->getOperand(1);
30172 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
30173 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
30174 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
30175 unsigned MaxAmount =
30176 VT.getSimpleVT().getScalarSizeInBits();
30178 // SSE2/AVX2 logical shifts always return a vector of 0s
30179 // if the shift amount is bigger than or equal to
30180 // the element size. The constant shift amount will be
30181 // encoded as an 8-bit immediate.
30182 if (ShiftAmt.trunc(8).uge(MaxAmount))
30183 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
30189 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
30190 TargetLowering::DAGCombinerInfo &DCI,
30191 const X86Subtarget &Subtarget) {
30192 if (N->getOpcode() == ISD::SHL)
30193 if (SDValue V = combineShiftLeft(N, DAG))
30196 if (N->getOpcode() == ISD::SRA)
30197 if (SDValue V = combineShiftRightAlgebraic(N, DAG))
30200 // Try to fold this logical shift into a zero vector.
30201 if (N->getOpcode() != ISD::SRA)
30202 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
30208 static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
30209 TargetLowering::DAGCombinerInfo &DCI,
30210 const X86Subtarget &Subtarget) {
30211 assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) &&
30212 "Unexpected opcode");
30213 EVT VT = N->getValueType(0);
30214 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
30216 // This fails for mask register (vXi1) shifts.
30217 if ((NumBitsPerElt % 8) != 0)
30220 // Out of range logical bit shifts are guaranteed to be zero.
30221 APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
30222 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt))
30223 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
30225 // Shift N0 by zero -> N0.
30227 return N->getOperand(0);
30229 // Shift zero -> zero.
30230 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
30231 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
30233 // We can decode 'whole byte' logical bit shifts as shuffles.
30234 if ((ShiftVal.getZExtValue() % 8) == 0) {
30236 SmallVector<int, 1> NonceMask; // Just a placeholder.
30237 NonceMask.push_back(0);
30238 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
30239 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
30241 return SDValue(); // This routine will use CombineTo to replace N.
30247 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
30248 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
30249 /// OR -> CMPNEQSS.
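/// For example (illustrative): (and (setcc E, (fcmp a, b)), (setcc NP, (fcmp a, b))),
/// i.e. an ordered "equal" test, becomes cmpeqss a, b followed by masking off
/// bit 0 of the result.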
30250 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
30251 TargetLowering::DAGCombinerInfo &DCI,
30252 const X86Subtarget &Subtarget) {
30255 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
30256 // we're requiring SSE2 for both.
30257 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
30258 SDValue N0 = N->getOperand(0);
30259 SDValue N1 = N->getOperand(1);
30260 SDValue CMP0 = N0->getOperand(1);
30261 SDValue CMP1 = N1->getOperand(1);
30264 // The SETCCs should both refer to the same CMP.
30265 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
30268 SDValue CMP00 = CMP0->getOperand(0);
30269 SDValue CMP01 = CMP0->getOperand(1);
30270 EVT VT = CMP00.getValueType();
30272 if (VT == MVT::f32 || VT == MVT::f64) {
30273 bool ExpectingFlags = false;
30274 // Check for any users that want flags:
30275 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
30276 !ExpectingFlags && UI != UE; ++UI)
30277 switch (UI->getOpcode()) {
30282 ExpectingFlags = true;
30284 case ISD::CopyToReg:
30285 case ISD::SIGN_EXTEND:
30286 case ISD::ZERO_EXTEND:
30287 case ISD::ANY_EXTEND:
30291 if (!ExpectingFlags) {
30292 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
30293 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
30295 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
30296 X86::CondCode tmp = cc0;
30301 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
30302 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
30303 // FIXME: need symbolic constants for these magic numbers.
30304 // See X86ATTInstPrinter.cpp:printSSECC().
30305 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
30306 if (Subtarget.hasAVX512()) {
30307 SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
30309 DAG.getConstant(x86cc, DL, MVT::i8));
30310 if (N->getValueType(0) != MVT::i1)
30311 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
30315 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
30316 CMP00.getValueType(), CMP00, CMP01,
30317 DAG.getConstant(x86cc, DL,
30320 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
30321 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
30323 if (is64BitFP && !Subtarget.is64Bit()) {
30324 // On a 32-bit target, we cannot bitcast the 64-bit float to a
30325 // 64-bit integer, since that's not a legal type. Since
30326 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
30327 // bits, but can do this little dance to extract the lowest 32 bits
30328 // and work with those going forward.
30329 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
30331 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
30332 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
30333 Vector32, DAG.getIntPtrConstant(0, DL));
30337 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
30338 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
30339 DAG.getConstant(1, DL, IntVT));
30340 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
30342 return OneBitOfTruth;
30350 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
30351 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
30352 assert(N->getOpcode() == ISD::AND);
30354 EVT VT = N->getValueType(0);
30355 SDValue N0 = N->getOperand(0);
30356 SDValue N1 = N->getOperand(1);
30359 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
30362 // Canonicalize XOR to the left.
30363 if (N1.getOpcode() == ISD::XOR)
30366 if (N0.getOpcode() != ISD::XOR)
30369 SDValue N00 = N0->getOperand(0);
30370 SDValue N01 = N0->getOperand(1);
30372 N01 = peekThroughBitcasts(N01);
30374 // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
30375 // insert_subvector building a 256-bit AllOnes vector.
30376 if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
30377 if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
30380 SDValue V1 = N01->getOperand(0);
30381 SDValue V2 = N01->getOperand(1);
30382 if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
30383 !V1.getOperand(0).isUndef() ||
30384 !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
30385 !ISD::isBuildVectorAllOnes(V2.getNode()))
30388 return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
30391 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
30392 // register. In most cases we actually compare or select YMM-sized registers
30393 // and mixing the two types creates horrible code. This method optimizes
30394 // some of the transition sequences.
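// For example (illustrative): sext(v8i16 (and (trunc %a), (trunc %b))) with
// v8i32 inputs becomes a single v8i32 AND followed by a sign_extend_inreg,
// so the values stay YMM-sized throughout.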
30395 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
30396 TargetLowering::DAGCombinerInfo &DCI,
30397 const X86Subtarget &Subtarget) {
30398 EVT VT = N->getValueType(0);
30399 if (!VT.is256BitVector())
30402 assert((N->getOpcode() == ISD::ANY_EXTEND ||
30403 N->getOpcode() == ISD::ZERO_EXTEND ||
30404 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
30406 SDValue Narrow = N->getOperand(0);
30407 EVT NarrowVT = Narrow->getValueType(0);
30408 if (!NarrowVT.is128BitVector())
30411 if (Narrow->getOpcode() != ISD::XOR &&
30412 Narrow->getOpcode() != ISD::AND &&
30413 Narrow->getOpcode() != ISD::OR)
30416 SDValue N0 = Narrow->getOperand(0);
30417 SDValue N1 = Narrow->getOperand(1);
30420 // The Left side has to be a trunc.
30421 if (N0.getOpcode() != ISD::TRUNCATE)
30424 // The type of the truncated inputs.
30425 EVT WideVT = N0->getOperand(0)->getValueType(0);
30429 // The right side has to be a 'trunc' or a constant vector.
30430 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
30431 ConstantSDNode *RHSConstSplat = nullptr;
30432 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
30433 RHSConstSplat = RHSBV->getConstantSplatNode();
30434 if (!RHSTrunc && !RHSConstSplat)
30437 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30439 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
30442 // Set N0 and N1 to hold the inputs to the new wide operation.
30443 N0 = N0->getOperand(0);
30444 if (RHSConstSplat) {
30445 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
30446 SDValue(RHSConstSplat, 0));
30447 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
30448 } else if (RHSTrunc) {
30449 N1 = N1->getOperand(0);
30452 // Generate the wide operation.
30453 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
30454 unsigned Opcode = N->getOpcode();
30456 case ISD::ANY_EXTEND:
30458 case ISD::ZERO_EXTEND: {
30459 unsigned InBits = NarrowVT.getScalarSizeInBits();
30460 APInt Mask = APInt::getAllOnesValue(InBits);
30461 Mask = Mask.zext(VT.getScalarSizeInBits());
30462 return DAG.getNode(ISD::AND, DL, VT,
30463 Op, DAG.getConstant(Mask, DL, VT));
30465 case ISD::SIGN_EXTEND:
30466 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
30467 Op, DAG.getValueType(NarrowVT));
30469 llvm_unreachable("Unexpected opcode");
30473 /// If both input operands of a logic op are being cast from floating point
30474 /// types, try to convert this into a floating point logic node to avoid
30475 /// unnecessary moves from SSE to integer registers.
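/// For example (illustrative): (i32 and (bitcast f32 %a), (bitcast f32 %b))
/// is rewritten as (bitcast (FAND %a, %b)), so the value never has to leave
/// the SSE register file.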
30476 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
30477 const X86Subtarget &Subtarget) {
30478 unsigned FPOpcode = ISD::DELETED_NODE;
30479 if (N->getOpcode() == ISD::AND)
30480 FPOpcode = X86ISD::FAND;
30481 else if (N->getOpcode() == ISD::OR)
30482 FPOpcode = X86ISD::FOR;
30483 else if (N->getOpcode() == ISD::XOR)
30484 FPOpcode = X86ISD::FXOR;
30486 assert(FPOpcode != ISD::DELETED_NODE &&
30487 "Unexpected input node for FP logic conversion");
30489 EVT VT = N->getValueType(0);
30490 SDValue N0 = N->getOperand(0);
30491 SDValue N1 = N->getOperand(1);
30493 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
30494 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
30495 (Subtarget.hasSSE2() && VT == MVT::i64))) {
30496 SDValue N00 = N0.getOperand(0);
30497 SDValue N10 = N1.getOperand(0);
30498 EVT N00Type = N00.getValueType();
30499 EVT N10Type = N10.getValueType();
30500 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
30501 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
30502 return DAG.getBitcast(VT, FPLogic);
30508 /// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
30509 /// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
30510 /// eliminate loading the vector constant mask value. This relies on the fact
30511 /// that a PCMP always creates an all-ones or all-zeros bitmask per element.
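/// For example (illustrative, v4i32): (and (pcmpgtd %a, %b), <1,1,1,1>) becomes
/// (psrld (pcmpgtd %a, %b), 31), avoiding a constant-pool load of the mask.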
30512 static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
30513 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
30514 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
30516 // TODO: Use AssertSext to mark any nodes that have the property of producing
30517 // all-ones or all-zeros. Then check for that node rather than particular
30519 if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
30522 // The existence of the PCMP node guarantees that we have the required SSE2 or
30523 // AVX2 for a shift of this vector type, but there is no vector shift by
30524 // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
30525 // masked compare nodes, so they should not make it here.
30526 EVT VT0 = Op0.getValueType();
30527 EVT VT1 = Op1.getValueType();
30528 unsigned EltBitWidth = VT0.getScalarSizeInBits();
30529 if (VT0 != VT1 || EltBitWidth == 8)
30532 assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
30535 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
30539 SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
30540 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
30541 return DAG.getBitcast(N->getValueType(0), Shift);
30544 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
30545 TargetLowering::DAGCombinerInfo &DCI,
30546 const X86Subtarget &Subtarget) {
30547 if (DCI.isBeforeLegalizeOps())
30550 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
30553 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
30556 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
30559 if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
30562 EVT VT = N->getValueType(0);
30563 SDValue N0 = N->getOperand(0);
30564 SDValue N1 = N->getOperand(1);
30567 // Attempt to recursively combine a bitmask AND with shuffles.
30568 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
30570 SmallVector<int, 1> NonceMask; // Just a placeholder.
30571 NonceMask.push_back(0);
30572 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
30573 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
30575 return SDValue(); // This routine will use CombineTo to replace N.
30578 // Create BEXTR instructions
30579 // BEXTR is ((X >> imm) & (2**size-1))
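// For example (illustrative): (and (srl %x, 4), 0xFFF) becomes BEXTR with the
// control value 0xC04, i.e. extract MaskSize = 12 bits starting at Shift = 4.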
30580 if (VT != MVT::i32 && VT != MVT::i64)
30583 if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
30585 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
30588 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
30589 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
30590 if (MaskNode && ShiftNode) {
30591 uint64_t Mask = MaskNode->getZExtValue();
30592 uint64_t Shift = ShiftNode->getZExtValue();
30593 if (isMask_64(Mask)) {
30594 uint64_t MaskSize = countPopulation(Mask);
30595 if (Shift + MaskSize <= VT.getSizeInBits())
30596 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
30597 DAG.getConstant(Shift | (MaskSize << 8), DL,
30604 // Try to fold:
30605 // (or (and (m, y), (pandn m, x)))
30606 // into:
30607 // (vselect m, x, y)
30608 // As a special case, try to fold:
30609 // (or (and (m, (sub 0, x)), (pandn m, x)))
30610 // into:
30611 // (sub (xor X, M), M)
30612 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
30613 const X86Subtarget &Subtarget) {
30614 assert(N->getOpcode() == ISD::OR);
30616 SDValue N0 = N->getOperand(0);
30617 SDValue N1 = N->getOperand(1);
30618 EVT VT = N->getValueType(0);
30620 if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
30622 assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
30624 // Canonicalize pandn to RHS
30625 if (N0.getOpcode() == X86ISD::ANDNP)
30628 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
30631 SDValue Mask = N1.getOperand(0);
30632 SDValue X = N1.getOperand(1);
30634 if (N0.getOperand(0) == Mask)
30635 Y = N0.getOperand(1);
30636 if (N0.getOperand(1) == Mask)
30637 Y = N0.getOperand(0);
30639 // Check to see if the mask appeared in both the AND and ANDNP.
30643 // Validate that X, Y, and Mask are bitcasts, and see through them.
30644 Mask = peekThroughBitcasts(Mask);
30645 X = peekThroughBitcasts(X);
30646 Y = peekThroughBitcasts(Y);
30648 EVT MaskVT = Mask.getValueType();
30650 // Validate that the Mask operand is a vector sra node.
30651 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
30652 // there is no psrai.b
30653 unsigned EltBits = MaskVT.getScalarSizeInBits();
30654 unsigned SraAmt = ~0;
30655 if (Mask.getOpcode() == ISD::SRA) {
30656 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
30657 if (auto *AmtConst = AmtBV->getConstantSplatNode())
30658 SraAmt = AmtConst->getZExtValue();
30659 } else if (Mask.getOpcode() == X86ISD::VSRAI) {
30660 SDValue SraC = Mask.getOperand(1);
30661 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
30663 if ((SraAmt + 1) != EltBits)
30669 // (or (and (M, (sub 0, X)), (pandn M, X)))
30670 // which is a special case of vselect:
30671 // (vselect M, (sub 0, X), X)
30673 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
30674 // We know that, if fNegate is 0 or 1:
30675 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
30677 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
30678 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
30679 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
30680 // This lets us transform our vselect to:
30681 // (add (xor X, M), (and M, 1))
30683 // (sub (xor X, M), M)
30684 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
30685 auto IsNegV = [](SDNode *N, SDValue V) {
30686 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
30687 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
30690 if (IsNegV(Y.getNode(), X))
30692 else if (IsNegV(X.getNode(), Y))
30696 assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
30697 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
30698 SDValue SubOp2 = Mask;
30700 // If the negate was on the false side of the select, then
30701 // the operands of the SUB need to be swapped. PR 27251.
30702 // This is because the pattern being matched above is
30703 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
30704 // but if the pattern matched was
30705 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
30706 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
30707 // pattern also needs to be a negation of the replacement pattern above.
30708 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
30709 // sub accomplishes the negation of the replacement pattern.
30711 std::swap(SubOp1, SubOp2);
30713 return DAG.getBitcast(VT,
30714 DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
30718 // PBLENDVB is only available on SSE 4.1.
30719 if (!Subtarget.hasSSE41())
30722 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
30724 X = DAG.getBitcast(BlendVT, X);
30725 Y = DAG.getBitcast(BlendVT, Y);
30726 Mask = DAG.getBitcast(BlendVT, Mask);
30727 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
30728 return DAG.getBitcast(VT, Mask);
30731 // Helper function for combineOrCmpEqZeroToCtlzSrl
30735 // srl(ctlz x), log2(bitsize(x))
30736 // Input pattern is checked by caller.
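// For example, for a 32-bit x with lzcnt semantics (ctlz(0) == 32):
//   srl(ctlz x, 5) == 1  iff  x == 0
// since ctlz of any non-zero 32-bit value is at most 31.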
30737 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
30738 SelectionDAG &DAG) {
30739 SDValue Cmp = Op.getOperand(1);
30740 EVT VT = Cmp.getOperand(0).getValueType();
30741 unsigned Log2b = Log2_32(VT.getSizeInBits());
30743 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
30744 // The result of the shift is true or false, and on X86, the 32-bit
30745 // encoding of shr and lzcnt is more desirable.
30746 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
30747 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
30748 DAG.getConstant(Log2b, dl, VT));
30749 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
30752 // Try to transform:
30753 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
30755 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
30756 // Will also attempt to match more generic cases, e.g.:
30757 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
30758 // Only applies if the target supports the FastLZCNT feature.
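// For example, for two i32 values x and y this turns
//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into or(srl(ctlz(x), 5), srl(ctlz(y), 5)), which the generic DAG combiner
// then folds to srl(or(ctlz(x), ctlz(y)), 5).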
30759 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
30760 TargetLowering::DAGCombinerInfo &DCI,
30761 const X86Subtarget &Subtarget) {
30762 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
30765 auto isORCandidate = [](SDValue N) {
30766 return (N->getOpcode() == ISD::OR && N->hasOneUse());
30769 // Check that the zero extend is extending to 32 bits or more. The code generated by
30770 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
30771 // instructions to clear the upper bits.
30772 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
30773 !isORCandidate(N->getOperand(0)))
30776 // Check the node matches: setcc(eq, cmp 0)
30777 auto isSetCCCandidate = [](SDValue N) {
30778 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
30779 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
30780 N->getOperand(1).getOpcode() == X86ISD::CMP &&
30781 N->getOperand(1).getConstantOperandVal(1) == 0 &&
30782 N->getOperand(1).getValueType().bitsGE(MVT::i32);
30785 SDNode *OR = N->getOperand(0).getNode();
30786 SDValue LHS = OR->getOperand(0);
30787 SDValue RHS = OR->getOperand(1);
30789 // Save nodes matching or(or, setcc(eq, cmp 0)).
30790 SmallVector<SDNode *, 2> ORNodes;
30791 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
30792 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
30793 ORNodes.push_back(OR);
30794 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
30795 LHS = OR->getOperand(0);
30796 RHS = OR->getOperand(1);
30799 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
30800 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
30801 !isORCandidate(SDValue(OR, 0)))
30804 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
30806 // or(srl(ctlz),srl(ctlz)).
30807 // The dag combiner can then fold it into:
30808 // srl(or(ctlz, ctlz)).
30809 EVT VT = OR->getValueType(0);
30810 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
30811 SDValue Ret, NewRHS;
30812 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
30813 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
30818 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
30819 while (ORNodes.size() > 0) {
30820 OR = ORNodes.pop_back_val();
30821 LHS = OR->getOperand(0);
30822 RHS = OR->getOperand(1);
30823 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
30824 if (RHS->getOpcode() == ISD::OR)
30825 std::swap(LHS, RHS);
30826 EVT VT = OR->getValueType(0);
30827 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
30830 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
30834 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
30839 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
30840 TargetLowering::DAGCombinerInfo &DCI,
30841 const X86Subtarget &Subtarget) {
30842 if (DCI.isBeforeLegalizeOps())
30845 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
30848 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
30851 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
30854 SDValue N0 = N->getOperand(0);
30855 SDValue N1 = N->getOperand(1);
30856 EVT VT = N->getValueType(0);
30858 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
30861 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
30862 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
30864 // SHLD/SHRD instructions have lower register pressure, but on some
30865 // platforms they have higher latency than the equivalent
30866 // series of shifts/or that would otherwise be generated.
30867 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
30868 // have higher latencies and we are not optimizing for size.
30869 if (!OptForSize && Subtarget.isSHLDSlow())
30872 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
30874 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
30876 if (!N0.hasOneUse() || !N1.hasOneUse())
30879 SDValue ShAmt0 = N0.getOperand(1);
30880 if (ShAmt0.getValueType() != MVT::i8)
30882 SDValue ShAmt1 = N1.getOperand(1);
30883 if (ShAmt1.getValueType() != MVT::i8)
30885 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
30886 ShAmt0 = ShAmt0.getOperand(0);
30887 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
30888 ShAmt1 = ShAmt1.getOperand(0);
30891 unsigned Opc = X86ISD::SHLD;
30892 SDValue Op0 = N0.getOperand(0);
30893 SDValue Op1 = N1.getOperand(0);
30894 if (ShAmt0.getOpcode() == ISD::SUB ||
30895 ShAmt0.getOpcode() == ISD::XOR) {
30896 Opc = X86ISD::SHRD;
30897 std::swap(Op0, Op1);
30898 std::swap(ShAmt0, ShAmt1);
30901 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
30902 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
30903 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
30904 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
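// For example, with i32 operands and C == 5 this matches:
//   (X << 5) | (Y >> 27)          --> SHLD( X, Y, 5 )
//   (X >> 5) | (Y << 27)          --> SHRD( X, Y, 5 )
//   (X << 5) | ((Y >> 1) >> 26)   --> SHLD( X, Y, 5 )   // 26 == 5 ^ 31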
30905 unsigned Bits = VT.getSizeInBits();
30906 if (ShAmt1.getOpcode() == ISD::SUB) {
30907 SDValue Sum = ShAmt1.getOperand(0);
30908 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
30909 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
30910 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
30911 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
30912 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
30913 return DAG.getNode(Opc, DL, VT,
30915 DAG.getNode(ISD::TRUNCATE, DL,
30918 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
30919 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
30920 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
30921 return DAG.getNode(Opc, DL, VT,
30922 N0.getOperand(0), N1.getOperand(0),
30923 DAG.getNode(ISD::TRUNCATE, DL,
30925 } else if (ShAmt1.getOpcode() == ISD::XOR) {
30926 SDValue Mask = ShAmt1.getOperand(1);
30927 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
30928 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
30929 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
30930 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
30931 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
30932 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
30933 if (Op1.getOpcode() == InnerShift &&
30934 isa<ConstantSDNode>(Op1.getOperand(1)) &&
30935 Op1.getConstantOperandVal(1) == 1) {
30936 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
30937 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
30939 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
30940 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
30941 Op1.getOperand(0) == Op1.getOperand(1)) {
30942 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
30943 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
30952 /// Generate NEG and CMOV for integer abs.
30953 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
30954 EVT VT = N->getValueType(0);
30956 // Since X86 does not have CMOV for 8-bit integer, we don't convert
30957 // 8-bit integer abs to NEG and CMOV.
30958 if (VT.isInteger() && VT.getSizeInBits() == 8)
30961 SDValue N0 = N->getOperand(0);
30962 SDValue N1 = N->getOperand(1);
30965 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
30966 // and change it to SUB and CMOV.
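// For example, for i32 this matches the usual branchless abs sequence
//   Y = X >> 31 (arithmetic); abs = (X + Y) ^ Y
// and replaces it with Neg = SUB(0, X) plus a CMOV that picks X or Neg
// based on the flags produced by that subtraction.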
30967 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
30968 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
30969 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
30970 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
30971 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
30972 // Generate SUB & CMOV.
30973 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
30974 DAG.getConstant(0, DL, VT), N0.getOperand(0));
30975 SDValue Ops[] = {N0.getOperand(0), Neg,
30976 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
30977 SDValue(Neg.getNode(), 1)};
30978 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
30984 /// Try to turn tests against the signbit in the form of:
30985 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) into SETGT(X, -1).
30988 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
30989 // This is only worth doing if the output type is i8 or i1.
30990 EVT ResultType = N->getValueType(0);
30991 if (ResultType != MVT::i8 && ResultType != MVT::i1)
30994 SDValue N0 = N->getOperand(0);
30995 SDValue N1 = N->getOperand(1);
30997 // We should be performing an xor against a truncated shift.
30998 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
31001 // Make sure we are performing an xor against one.
31002 if (!isOneConstant(N1))
31005 // SetCC on x86 zero extends so only act on this if it's a logical shift.
31006 SDValue Shift = N0.getOperand(0);
31007 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
31010 // Make sure we are truncating from one of i16, i32 or i64.
31011 EVT ShiftTy = Shift.getValueType();
31012 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
31015 // Make sure the shift amount extracts the sign bit.
31016 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
31017 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
31020 // Create a greater-than comparison against -1.
31021 // N.B. Using SETGE against 0 works but we want a canonical looking
31022 // comparison; using SETGT matches up with what TranslateX86CC produces.
31024 SDValue ShiftOp = Shift.getOperand(0);
31025 EVT ShiftOpTy = ShiftOp.getValueType();
31026 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31027 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
31028 *DAG.getContext(), ResultType);
31029 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
31030 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
31031 if (SetCCResultType != ResultType)
31032 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
31036 /// Turn vector tests of the signbit in the form of:
31037 /// xor (sra X, elt_size(X)-1), -1 into pcmpgt X, -1.
31041 /// This should be called before type legalization because the pattern may not
31042 /// persist after that.
31043 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
31044 const X86Subtarget &Subtarget) {
31045 EVT VT = N->getValueType(0);
31046 if (!VT.isSimple())
31049 switch (VT.getSimpleVT().SimpleTy) {
31050 default: return SDValue();
31053 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
31054 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
31058 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
31061 // There must be a shift right algebraic before the xor, and the xor must be a
31062 // 'not' operation.
31063 SDValue Shift = N->getOperand(0);
31064 SDValue Ones = N->getOperand(1);
31065 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
31066 !ISD::isBuildVectorAllOnes(Ones.getNode()))
31069 // The shift should be smearing the sign bit across each vector element.
31070 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
31074 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
31075 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
31076 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
31079 // Create a greater-than comparison against -1. We don't use the more obvious
31080 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
31081 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
31084 /// Check if truncation with saturation from type \p SrcVT to \p DstVT
31085 /// is valid for the given \p Subtarget.
31087 isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
31088 if (!Subtarget.hasAVX512())
31090 EVT SrcElVT = SrcVT.getScalarType();
31091 EVT DstElVT = DstVT.getScalarType();
31092 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
31094 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
31096 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
31097 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
31101 /// Detect a pattern of truncation with saturation:
31102 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
31103 /// Return the source value to be truncated or SDValue() if the pattern was not
31104 /// matched or it is unsupported on the current target.
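/// For example, for a v16i32 -> v16i8 truncate this matches
///   (truncate (umin X, splat(255)))
/// and returns X; the callers below then emit an unsigned-saturating truncate
/// or truncating store (e.g. X86ISD::VTRUNCUS) when the subtarget allows it.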
31106 detectUSatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget) {
31107 if (In.getOpcode() != ISD::UMIN)
31110 EVT InVT = In.getValueType();
31111 // FIXME: Scalar type may be supported if we move it to vector register.
31112 if (!InVT.isVector() || !InVT.isSimple())
31115 if (!isSATValidOnSubtarget(InVT, VT, Subtarget))
31118 // Saturation with truncation. We truncate from InVT to VT.
31119 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
31120 "Unexpected types for truncate operation");
31124 if (ISD::isConstantSplatVector(In.getOperand(0).getNode(), C))
31125 SrcVal = In.getOperand(1);
31126 else if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C))
31127 SrcVal = In.getOperand(0);
31131 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
31132 // the element size of the destination type.
31133 return (C == ((uint64_t)1 << VT.getScalarSizeInBits()) - 1) ?
31134 SrcVal : SDValue();
31137 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
31138 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
31139 /// X86ISD::AVG instruction.
31140 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
31141 const X86Subtarget &Subtarget,
31143 if (!VT.isVector() || !VT.isSimple())
31145 EVT InVT = In.getValueType();
31146 unsigned NumElems = VT.getVectorNumElements();
31148 EVT ScalarVT = VT.getVectorElementType();
31149 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
31150 isPowerOf2_32(NumElems)))
31154 // InScalarVT is the intermediate type in the AVG pattern and it should be greater
31154 // than the original input type (i8/i16).
31155 EVT InScalarVT = InVT.getVectorElementType();
31156 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
31159 if (!Subtarget.hasSSE2())
31161 if (Subtarget.hasBWI()) {
31162 if (VT.getSizeInBits() > 512)
31164 } else if (Subtarget.hasAVX2()) {
31165 if (VT.getSizeInBits() > 256)
31168 if (VT.getSizeInBits() > 128)
31172 // Detect the following pattern:
31174 // %1 = zext <N x i8> %a to <N x i32>
31175 // %2 = zext <N x i8> %b to <N x i32>
31176 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
31177 // %4 = add nuw nsw <N x i32> %3, %2
31178 // %5 = lshr <N x i32> %4, <i32 1 x N>
31179 // %6 = trunc <N x i32> %5 to <N x i8>
31181 // In AVX512, the last instruction can also be a trunc store.
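// As a quick numeric check with i8 inputs a = 250 and b = 4:
//   zext to i32, 250 + 4 + 1 = 255, lshr by 1 = 127, trunc to i8 = 127,
// which matches the rounded average that PAVGB would produce.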
31183 if (In.getOpcode() != ISD::SRL)
31186 // A lambda checking the given SDValue is a constant vector and each element
31187 // is in the range [Min, Max].
31188 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
31189 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
31190 if (!BV || !BV->isConstant())
31192 for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
31193 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
31196 uint64_t Val = C->getZExtValue();
31197 if (Val < Min || Val > Max)
31203 // Check if each element of the vector is left-shifted by one.
31204 auto LHS = In.getOperand(0);
31205 auto RHS = In.getOperand(1);
31206 if (!IsConstVectorInRange(RHS, 1, 1))
31208 if (LHS.getOpcode() != ISD::ADD)
31211 // Detect a pattern of a + b + 1 where the order doesn't matter.
31212 SDValue Operands[3];
31213 Operands[0] = LHS.getOperand(0);
31214 Operands[1] = LHS.getOperand(1);
31216 // Take care of the case when one of the operands is a constant vector whose
31217 // element is in the range [1, 256] (or [1, 65536] for i16).
31218 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
31219 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
31220 Operands[0].getOperand(0).getValueType() == VT) {
31221 // The pattern is detected. Subtract one from the constant vector, then
31222 // demote it and emit the X86ISD::AVG instruction.
31223 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
31224 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
31225 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
31226 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
31230 if (Operands[0].getOpcode() == ISD::ADD)
31231 std::swap(Operands[0], Operands[1]);
31232 else if (Operands[1].getOpcode() != ISD::ADD)
31234 Operands[2] = Operands[1].getOperand(0);
31235 Operands[1] = Operands[1].getOperand(1);
31237 // Now we have three operands of two additions. Check that one of them is a
31238 // constant vector with ones, and the other two are promoted from i8/i16.
31239 for (int i = 0; i < 3; ++i) {
31240 if (!IsConstVectorInRange(Operands[i], 1, 1))
31242 std::swap(Operands[i], Operands[2]);
31244 // Check if Operands[0] and Operands[1] are results of type promotion.
31245 for (int j = 0; j < 2; ++j)
31246 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
31247 Operands[j].getOperand(0).getValueType() != VT)
31250 // The pattern is detected; emit the X86ISD::AVG instruction.
31251 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
31252 Operands[1].getOperand(0));
31258 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
31259 TargetLowering::DAGCombinerInfo &DCI,
31260 const X86Subtarget &Subtarget) {
31261 LoadSDNode *Ld = cast<LoadSDNode>(N);
31262 EVT RegVT = Ld->getValueType(0);
31263 EVT MemVT = Ld->getMemoryVT();
31265 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31267 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
31268 // into two 16-byte operations.
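// For example, an unaligned 256-bit load on such a chip becomes two 128-bit
// loads at the base address and at base+16, inserted into the low and high
// halves of the result, with the two load chains joined by a TokenFactor.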
31269 ISD::LoadExtType Ext = Ld->getExtensionType();
31271 unsigned AddressSpace = Ld->getAddressSpace();
31272 unsigned Alignment = Ld->getAlignment();
31273 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
31274 Ext == ISD::NON_EXTLOAD &&
31275 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
31276 AddressSpace, Alignment, &Fast) && !Fast) {
31277 unsigned NumElems = RegVT.getVectorNumElements();
31281 SDValue Ptr = Ld->getBasePtr();
31283 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
31286 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
31287 Alignment, Ld->getMemOperand()->getFlags());
31289 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
31291 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
31292 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
31293 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31295 Load2.getValue(1));
31297 SDValue NewVec = DAG.getUNDEF(RegVT);
31298 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
31299 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
31300 return DCI.CombineTo(N, NewVec, TF, true);
31306 /// If V is a build vector of boolean constants and exactly one of those
31307 /// constants is true, return the operand index of that true element.
31308 /// Otherwise, return -1.
31309 static int getOneTrueElt(SDValue V) {
31310 // This needs to be a build vector of booleans.
31311 // TODO: Checking for the i1 type matches the IR definition for the mask,
31312 // but the mask check could be loosened to i8 or other types. That might
31313 // also require checking more than 'allOnesValue'; e.g., the x86 HW
31314 // instructions only require that the MSB is set for each mask element.
31315 // The ISD::MSTORE comments/definition do not specify how the mask operand is interpreted.
31317 auto *BV = dyn_cast<BuildVectorSDNode>(V);
31318 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
31321 int TrueIndex = -1;
31322 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
31323 for (unsigned i = 0; i < NumElts; ++i) {
31324 const SDValue &Op = BV->getOperand(i);
31327 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
31330 if (ConstNode->getAPIntValue().isAllOnesValue()) {
31331 // If we already found a one, this is too many.
31332 if (TrueIndex >= 0)
31340 /// Given a masked memory load/store operation, return true if it has one mask
31341 /// bit set. If it has one mask bit set, then also return the memory address of
31342 /// the scalar element to load/store, the vector index to insert/extract that
31343 /// scalar element, and the alignment for the scalar memory access.
31344 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
31345 SelectionDAG &DAG, SDValue &Addr,
31346 SDValue &Index, unsigned &Alignment) {
31347 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
31348 if (TrueMaskElt < 0)
31351 // Get the address of the one scalar element that is specified by the mask
31352 // using the appropriate offset from the base pointer.
31353 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
31354 Addr = MaskedOp->getBasePtr();
31355 if (TrueMaskElt != 0) {
31356 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
31357 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
31360 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
31361 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
31365 /// If exactly one element of the mask is set for a non-extending masked load,
31366 /// it is a scalar load and vector insert.
31367 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
31368 /// mask have already been optimized in IR, so we don't bother with those here.
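/// For example, a masked load of v4f32 with mask <0,0,1,0> becomes a scalar
/// load of element 2 (at base + 2 * sizeof(float)) followed by an
/// INSERT_VECTOR_ELT of that value into the pass-through vector.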
31370 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
31371 TargetLowering::DAGCombinerInfo &DCI) {
31372 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
31373 // However, some target hooks may need to be added to know when the transform
31374 // is profitable. Endianness would also have to be considered.
31376 SDValue Addr, VecIndex;
31377 unsigned Alignment;
31378 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
31381 // Load the one scalar element that is specified by the mask using the
31382 // appropriate offset from the base pointer.
31384 EVT VT = ML->getValueType(0);
31385 EVT EltVT = VT.getVectorElementType();
31387 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
31388 Alignment, ML->getMemOperand()->getFlags());
31390 // Insert the loaded element into the appropriate place in the vector.
31391 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
31393 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
31397 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
31398 TargetLowering::DAGCombinerInfo &DCI) {
31399 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
31403 EVT VT = ML->getValueType(0);
31405 // If we are loading the first and last elements of a vector, it is safe and
31406 // always faster to load the whole vector. Replace the masked load with a
31407 // vector load and select.
31408 unsigned NumElts = VT.getVectorNumElements();
31409 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
31410 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
31411 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
31412 if (LoadFirstElt && LoadLastElt) {
31413 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
31414 ML->getMemOperand());
31415 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
31416 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
31419 // Convert a masked load with a constant mask into a masked load and a select.
31420 // This allows the select operation to use a faster kind of select instruction
31421 // (for example, vblendvps -> vblendps).
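// For example (with a defined pass-through value Src0):
//   masked_load(ptr, <1,0,1,1>, Src0)
//     --> select(<1,0,1,1>, masked_load(ptr, <1,0,1,1>, undef), Src0)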
31423 // Don't try this if the pass-through operand is already undefined. That would
31424 // cause an infinite loop because that's what we're about to create.
31425 if (ML->getSrc0().isUndef())
31428 // The new masked load has an undef pass-through operand. The select uses the
31429 // original pass-through operand.
31430 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
31431 ML->getMask(), DAG.getUNDEF(VT),
31432 ML->getMemoryVT(), ML->getMemOperand(),
31433 ML->getExtensionType());
31434 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
31436 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
31439 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
31440 TargetLowering::DAGCombinerInfo &DCI,
31441 const X86Subtarget &Subtarget) {
31442 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
31444 // TODO: Expanding load with constant mask may be optimized as well.
31445 if (Mld->isExpandingLoad())
31448 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
31449 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
31451 // TODO: Do some AVX512 subsets benefit from this transform?
31452 if (!Subtarget.hasAVX512())
31453 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
31457 if (Mld->getExtensionType() != ISD::SEXTLOAD)
31460 // Resolve extending loads.
31461 EVT VT = Mld->getValueType(0);
31462 unsigned NumElems = VT.getVectorNumElements();
31463 EVT LdVT = Mld->getMemoryVT();
31466 assert(LdVT != VT && "Cannot extend to the same type");
31467 unsigned ToSz = VT.getScalarSizeInBits();
31468 unsigned FromSz = LdVT.getScalarSizeInBits();
31469 // From/To sizes and ElemCount must be pow of two.
31470 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
31471 "Unexpected size for extending masked load");
31473 unsigned SizeRatio = ToSz / FromSz;
31474 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
31476 // Create a type on which we perform the shuffle.
31477 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
31478 LdVT.getScalarType(), NumElems*SizeRatio);
31479 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
31481 // Convert Src0 value.
31482 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
31483 if (!Mld->getSrc0().isUndef()) {
31484 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
31485 for (unsigned i = 0; i != NumElems; ++i)
31486 ShuffleVec[i] = i * SizeRatio;
31488 // Can't shuffle using an illegal type.
31489 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
31490 "WideVecVT should be legal");
31491 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
31492 DAG.getUNDEF(WideVecVT), ShuffleVec);
31494 // Prepare the new mask.
31496 SDValue Mask = Mld->getMask();
31497 if (Mask.getValueType() == VT) {
31498 // Mask and original value have the same type.
31499 NewMask = DAG.getBitcast(WideVecVT, Mask);
31500 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
31501 for (unsigned i = 0; i != NumElems; ++i)
31502 ShuffleVec[i] = i * SizeRatio;
31503 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
31504 ShuffleVec[i] = NumElems * SizeRatio;
31505 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
31506 DAG.getConstant(0, dl, WideVecVT),
31509 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
31510 unsigned WidenNumElts = NumElems*SizeRatio;
31511 unsigned MaskNumElts = VT.getVectorNumElements();
31512 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
31515 unsigned NumConcat = WidenNumElts / MaskNumElts;
31516 SmallVector<SDValue, 16> Ops(NumConcat);
31517 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
31519 for (unsigned i = 1; i != NumConcat; ++i)
31522 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
31525 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
31526 Mld->getBasePtr(), NewMask, WideSrc0,
31527 Mld->getMemoryVT(), Mld->getMemOperand(),
31529 SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
31530 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
31533 /// If exactly one element of the mask is set for a non-truncating masked store,
31534 /// it is a vector extract and scalar store.
31535 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
31536 /// mask have already been optimized in IR, so we don't bother with those here.
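/// For example, a masked store of v4i32 with mask <0,1,0,0> becomes an
/// EXTRACT_VECTOR_ELT of element 1 followed by a plain i32 store at
/// base + 4.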
31537 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
31538 SelectionDAG &DAG) {
31539 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
31540 // However, some target hooks may need to be added to know when the transform
31541 // is profitable. Endianness would also have to be considered.
31543 SDValue Addr, VecIndex;
31544 unsigned Alignment;
31545 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
31548 // Extract the one scalar element that is actually being stored.
31550 EVT VT = MS->getValue().getValueType();
31551 EVT EltVT = VT.getVectorElementType();
31552 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
31553 MS->getValue(), VecIndex);
31555 // Store that element at the appropriate offset from the base pointer.
31556 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
31557 Alignment, MS->getMemOperand()->getFlags());
31560 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
31561 const X86Subtarget &Subtarget) {
31562 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
31564 if (Mst->isCompressingStore())
31567 if (!Mst->isTruncatingStore())
31568 return reduceMaskedStoreToScalarStore(Mst, DAG);
31570 // Resolve truncating stores.
31571 EVT VT = Mst->getValue().getValueType();
31572 unsigned NumElems = VT.getVectorNumElements();
31573 EVT StVT = Mst->getMemoryVT();
31576 assert(StVT != VT && "Cannot truncate to the same type");
31577 unsigned FromSz = VT.getScalarSizeInBits();
31578 unsigned ToSz = StVT.getScalarSizeInBits();
31580 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31582 // The truncating store is legal in some cases. For example
31583 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
31584 // are designated for truncate store.
31585 // In this case we don't need any further transformations.
31586 if (TLI.isTruncStoreLegal(VT, StVT))
31589 // From/To sizes and ElemCount must be pow of two.
31590 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
31591 "Unexpected size for truncating masked store");
31592 // We are going to use the original vector elt for storing.
31593 // Accumulated smaller vector elements must be a multiple of the store size.
31594 assert (((NumElems * FromSz) % ToSz) == 0 &&
31595 "Unexpected ratio for truncating masked store");
31597 unsigned SizeRatio = FromSz / ToSz;
31598 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
31600 // Create a type on which we perform the shuffle.
31601 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
31602 StVT.getScalarType(), NumElems*SizeRatio);
31604 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
31606 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
31607 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
31608 for (unsigned i = 0; i != NumElems; ++i)
31609 ShuffleVec[i] = i * SizeRatio;
31611 // Can't shuffle using an illegal type.
31612 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
31613 "WideVecVT should be legal");
31615 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
31616 DAG.getUNDEF(WideVecVT),
31620 SDValue Mask = Mst->getMask();
31621 if (Mask.getValueType() == VT) {
31622 // Mask and original value have the same type.
31623 NewMask = DAG.getBitcast(WideVecVT, Mask);
31624 for (unsigned i = 0; i != NumElems; ++i)
31625 ShuffleVec[i] = i * SizeRatio;
31626 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
31627 ShuffleVec[i] = NumElems*SizeRatio;
31628 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
31629 DAG.getConstant(0, dl, WideVecVT),
31632 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
31633 unsigned WidenNumElts = NumElems*SizeRatio;
31634 unsigned MaskNumElts = VT.getVectorNumElements();
31635 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
31638 unsigned NumConcat = WidenNumElts / MaskNumElts;
31639 SmallVector<SDValue, 16> Ops(NumConcat);
31640 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
31642 for (unsigned i = 1; i != NumConcat; ++i)
31645 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
31648 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
31649 Mst->getBasePtr(), NewMask, StVT,
31650 Mst->getMemOperand(), false);
31653 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
31654 const X86Subtarget &Subtarget) {
31655 StoreSDNode *St = cast<StoreSDNode>(N);
31656 EVT VT = St->getValue().getValueType();
31657 EVT StVT = St->getMemoryVT();
31659 SDValue StoredVal = St->getOperand(1);
31660 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31662 // If we are saving a concatenation of two XMM registers and 32-byte stores
31663 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
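// For example, a 256-bit store then becomes two 128-bit stores of the low and
// high halves at the base address and at base+16, with the two store chains
// joined by a TokenFactor.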
31665 unsigned AddressSpace = St->getAddressSpace();
31666 unsigned Alignment = St->getAlignment();
31667 if (VT.is256BitVector() && StVT == VT &&
31668 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
31669 AddressSpace, Alignment, &Fast) &&
31671 unsigned NumElems = VT.getVectorNumElements();
31675 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
31676 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
31678 SDValue Ptr0 = St->getBasePtr();
31679 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
31682 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
31683 Alignment, St->getMemOperand()->getFlags());
31685 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
31686 std::min(16U, Alignment), St->getMemOperand()->getFlags());
31687 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
31690 // Optimize trunc store (of multiple scalars) to shuffle and store.
31691 // First, pack all of the elements in one place. Next, store to memory
31692 // in fewer chunks.
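// For example, when a native truncating store is not available, a truncating
// store of v4i32 to v4i16 can bitcast the value to v8i16, shuffle elements
// {0,2,4,6} down to the low half, and then store the low 64 bits with a
// single wide store.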
31693 if (St->isTruncatingStore() && VT.isVector()) {
31694 // Check if we can detect an AVG pattern from the truncation. If yes,
31695 // replace the trunc store by a normal store with the result of X86ISD::AVG
31697 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
31699 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
31700 St->getPointerInfo(), St->getAlignment(),
31701 St->getMemOperand()->getFlags());
31704 detectUSatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
31705 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
31706 dl, Val, St->getBasePtr(),
31707 St->getMemoryVT(), St->getMemOperand(), DAG);
31709 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31710 unsigned NumElems = VT.getVectorNumElements();
31711 assert(StVT != VT && "Cannot truncate to the same type");
31712 unsigned FromSz = VT.getScalarSizeInBits();
31713 unsigned ToSz = StVT.getScalarSizeInBits();
31715 // The truncating store is legal in some cases. For example
31716 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
31717 // are designated for truncate store.
31718 // In this case we don't need any further transformations.
31719 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
31722 // From, To sizes and ElemCount must be pow of two
31723 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
31724 // We are going to use the original vector elt for storing.
31725 // Accumulated smaller vector elements must be a multiple of the store size.
31726 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
31728 unsigned SizeRatio = FromSz / ToSz;
31730 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
31732 // Create a type on which we perform the shuffle
31733 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
31734 StVT.getScalarType(), NumElems*SizeRatio);
31736 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
31738 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
31739 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
31740 for (unsigned i = 0; i != NumElems; ++i)
31741 ShuffleVec[i] = i * SizeRatio;
31743 // Can't shuffle using an illegal type.
31744 if (!TLI.isTypeLegal(WideVecVT))
31747 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
31748 DAG.getUNDEF(WideVecVT),
31750 // At this point all of the data is stored at the bottom of the
31751 // register. We now need to save it to mem.
31753 // Find the largest store unit
31754 MVT StoreType = MVT::i8;
31755 for (MVT Tp : MVT::integer_valuetypes()) {
31756 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
31760 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
31761 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
31762 (64 <= NumElems * ToSz))
31763 StoreType = MVT::f64;
31765 // Bitcast the original vector into a vector of store-size units
31766 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
31767 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
31768 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
31769 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
31770 SmallVector<SDValue, 8> Chains;
31771 SDValue Ptr = St->getBasePtr();
31773 // Perform one or more big stores into memory.
31774 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
31775 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
31776 StoreType, ShuffWide,
31777 DAG.getIntPtrConstant(i, dl));
31779 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
31780 St->getAlignment(), St->getMemOperand()->getFlags());
31781 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
31782 Chains.push_back(Ch);
31785 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
31788 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
31789 // the FP state in cases where an emms may be missing.
31790 // A preferable solution to the general problem is to figure out the right
31791 // places to insert EMMS. This qualifies as a quick hack.
31793 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
31794 if (VT.getSizeInBits() != 64)
31797 const Function *F = DAG.getMachineFunction().getFunction();
31798 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
31800 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
31801 if ((VT.isVector() ||
31802 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
31803 isa<LoadSDNode>(St->getValue()) &&
31804 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
31805 St->getChain().hasOneUse() && !St->isVolatile()) {
31806 SDNode* LdVal = St->getValue().getNode();
31807 LoadSDNode *Ld = nullptr;
31808 int TokenFactorIndex = -1;
31809 SmallVector<SDValue, 8> Ops;
31810 SDNode* ChainVal = St->getChain().getNode();
31811 // Must be a store of a load. We currently handle two cases: the load
31812 // is a direct child, and it's under an intervening TokenFactor. It is
31813 // possible to dig deeper under nested TokenFactors.
31814 if (ChainVal == LdVal)
31815 Ld = cast<LoadSDNode>(St->getChain());
31816 else if (St->getValue().hasOneUse() &&
31817 ChainVal->getOpcode() == ISD::TokenFactor) {
31818 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
31819 if (ChainVal->getOperand(i).getNode() == LdVal) {
31820 TokenFactorIndex = i;
31821 Ld = cast<LoadSDNode>(St->getValue());
31823 Ops.push_back(ChainVal->getOperand(i));
31827 if (!Ld || !ISD::isNormalLoad(Ld))
31830 // If this is not the MMX case, i.e. we are just turning i64 load/store
31831 // into f64 load/store, avoid the transformation if there are multiple
31832 // uses of the loaded value.
31833 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
31838 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
31839 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
31841 if (Subtarget.is64Bit() || F64IsLegal) {
31842 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
31843 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
31844 Ld->getPointerInfo(), Ld->getAlignment(),
31845 Ld->getMemOperand()->getFlags());
31846 SDValue NewChain = NewLd.getValue(1);
31847 if (TokenFactorIndex >= 0) {
31848 Ops.push_back(NewChain);
31849 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
31851 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
31852 St->getPointerInfo(), St->getAlignment(),
31853 St->getMemOperand()->getFlags());
31856 // Otherwise, lower to two pairs of 32-bit loads / stores.
31857 SDValue LoAddr = Ld->getBasePtr();
31858 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
31860 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
31861 Ld->getPointerInfo(), Ld->getAlignment(),
31862 Ld->getMemOperand()->getFlags());
31863 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
31864 Ld->getPointerInfo().getWithOffset(4),
31865 MinAlign(Ld->getAlignment(), 4),
31866 Ld->getMemOperand()->getFlags());
31868 SDValue NewChain = LoLd.getValue(1);
31869 if (TokenFactorIndex >= 0) {
31870 Ops.push_back(LoLd);
31871 Ops.push_back(HiLd);
31872 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
31875 LoAddr = St->getBasePtr();
31876 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
31879 DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
31880 St->getAlignment(), St->getMemOperand()->getFlags());
31881 SDValue HiSt = DAG.getStore(
31882 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
31883 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
31884 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
31887 // This is similar to the above case, but here we handle a scalar 64-bit
31888 // integer store that is extracted from a vector on a 32-bit target.
31889 // If we have SSE2, then we can treat it like a floating-point double
31890 // to get past legalization. The execution dependencies fixup pass will
31891 // choose the optimal machine instruction for the store if this really is
31892 // an integer or v2f32 rather than an f64.
31893 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
31894 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
31895 SDValue OldExtract = St->getOperand(1);
31896 SDValue ExtOp0 = OldExtract.getOperand(0);
31897 unsigned VecSize = ExtOp0.getValueSizeInBits();
31898 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
31899 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
31900 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
31901 BitCast, OldExtract.getOperand(1));
31902 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
31903 St->getPointerInfo(), St->getAlignment(),
31904 St->getMemOperand()->getFlags());
31910 /// Return 'true' if this vector operation is "horizontal"
31911 /// and return the operands for the horizontal operation in LHS and RHS. A
31912 /// horizontal operation performs the binary operation on successive elements
31913 /// of its first operand, then on successive elements of its second operand,
31914 /// returning the resulting values in a vector. For example, if
31915 /// A = < float a0, float a1, float a2, float a3 >
31917 /// B = < float b0, float b1, float b2, float b3 >
31918 /// then the result of doing a horizontal operation on A and B is
31919 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
31920 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
31921 /// A horizontal-op B, for some already available A and B, and if so then LHS is
31922 /// set to A, RHS to B, and the routine returns 'true'.
31923 /// Note that the binary operation should have the property that if one of the
31924 /// operands is UNDEF then the result is UNDEF.
31925 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
31926 // Look for the following pattern: if
31927 // A = < float a0, float a1, float a2, float a3 >
31928 // B = < float b0, float b1, float b2, float b3 >
31930 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
31931 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
31932 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
31933 // which is A horizontal-op B.
31935 // At least one of the operands should be a vector shuffle.
31936 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
31937 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
31940 MVT VT = LHS.getSimpleValueType();
31942 assert((VT.is128BitVector() || VT.is256BitVector()) &&
31943 "Unsupported vector type for horizontal add/sub");
31945 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
31946 // operate independently on 128-bit lanes.
31947 unsigned NumElts = VT.getVectorNumElements();
31948 unsigned NumLanes = VT.getSizeInBits()/128;
31949 unsigned NumLaneElts = NumElts / NumLanes;
31950 assert((NumLaneElts % 2 == 0) &&
31951 "Vector type should have an even number of elements in each lane");
31952 unsigned HalfLaneElts = NumLaneElts/2;
31954 // View LHS in the form
31955 // LHS = VECTOR_SHUFFLE A, B, LMask
31956 // If LHS is not a shuffle then pretend it is the shuffle
31957 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
31958 // NOTE: in what follows a default initialized SDValue represents an UNDEF of type VT.
31961 SmallVector<int, 16> LMask(NumElts);
31962 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
31963 if (!LHS.getOperand(0).isUndef())
31964 A = LHS.getOperand(0);
31965 if (!LHS.getOperand(1).isUndef())
31966 B = LHS.getOperand(1);
31967 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
31968 std::copy(Mask.begin(), Mask.end(), LMask.begin());
31970 if (!LHS.isUndef())
31972 for (unsigned i = 0; i != NumElts; ++i)
31976 // Likewise, view RHS in the form
31977 // RHS = VECTOR_SHUFFLE C, D, RMask
31979 SmallVector<int, 16> RMask(NumElts);
31980 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
31981 if (!RHS.getOperand(0).isUndef())
31982 C = RHS.getOperand(0);
31983 if (!RHS.getOperand(1).isUndef())
31984 D = RHS.getOperand(1);
31985 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
31986 std::copy(Mask.begin(), Mask.end(), RMask.begin());
31988 if (!RHS.isUndef())
31990 for (unsigned i = 0; i != NumElts; ++i)
31994 // Check that the shuffles are both shuffling the same vectors.
31995 if (!(A == C && B == D) && !(A == D && B == C))
31998 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
31999 if (!A.getNode() && !B.getNode())
32002 // If A and B occur in reverse order in RHS, then "swap" them (which means
32003 // rewriting the mask).
32005 ShuffleVectorSDNode::commuteMask(RMask);
32007 // At this point LHS and RHS are equivalent to
32008 // LHS = VECTOR_SHUFFLE A, B, LMask
32009 // RHS = VECTOR_SHUFFLE A, B, RMask
32010 // Check that the masks correspond to performing a horizontal operation.
32011 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
32012 for (unsigned i = 0; i != NumLaneElts; ++i) {
32013 int LIdx = LMask[i+l], RIdx = RMask[i+l];
32015 // Ignore any UNDEF components.
32016 if (LIdx < 0 || RIdx < 0 ||
32017 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
32018 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
32021 // Check that successive elements are being operated on. If not, this is
32022 // not a horizontal operation.
32023 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
32024 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
32025 if (!(LIdx == Index && RIdx == Index + 1) &&
32026 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
32031 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
32032 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
32036 /// Do target-specific dag combines on floating-point adds/subs.
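/// For example, with SSE3 available:
///   fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>)
/// becomes X86ISD::FHADD A, B (i.e. haddps).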
32037 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
32038 const X86Subtarget &Subtarget) {
32039 EVT VT = N->getValueType(0);
32040 SDValue LHS = N->getOperand(0);
32041 SDValue RHS = N->getOperand(1);
32042 bool IsFadd = N->getOpcode() == ISD::FADD;
32043 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
32045 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
32046 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
32047 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
32048 isHorizontalBinOp(LHS, RHS, IsFadd)) {
32049 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
32050 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
32055 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen,
32057 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
32058 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
32059 const X86Subtarget &Subtarget,
32061 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
32062 SDValue Src = N->getOperand(0);
32063 unsigned Opcode = Src.getOpcode();
32064 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32066 EVT VT = N->getValueType(0);
32067 EVT SrcVT = Src.getValueType();
32069 auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
32070 // TODO: Add extra cases where we can truncate both inputs for the
32071 // cost of one (or none).
32072 // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
32076 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
32077 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
32078 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
32079 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
32082 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
32083 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
32084 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
32085 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
32088 // Don't combine if the operation has other uses.
32089 if (!N->isOnlyUserOf(Src.getNode()))
32092 // Only support vector truncation for now.
32093 // TODO: i64 scalar math would benefit as well.
32094 if (!VT.isVector())
32097 // In most cases it's only worth pre-truncating if we're only facing the cost
32098 // of one truncation.
32099 // i.e. if one of the inputs will constant fold or the input is repeated.
32104 SDValue Op0 = Src.getOperand(0);
32105 SDValue Op1 = Src.getOperand(1);
32106 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
32107 IsRepeatedOpOrOneUseConstant(Op0, Op1))
32108 return TruncateArithmetic(Op0, Op1);
32113 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
32114 // better to truncate if we have the chance.
32115 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
32116 !TLI.isOperationLegal(Opcode, SrcVT))
32117 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
32120 SDValue Op0 = Src.getOperand(0);
32121 SDValue Op1 = Src.getOperand(1);
32122 if (TLI.isOperationLegal(Opcode, VT) &&
32123 IsRepeatedOpOrOneUseConstant(Op0, Op1))
32124 return TruncateArithmetic(Op0, Op1);
32132 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
32134 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
32135 SmallVector<SDValue, 8> &Regs) {
32136 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
32137 Regs[0].getValueType() == MVT::v2i64));
32138 EVT OutVT = N->getValueType(0);
32139 EVT OutSVT = OutVT.getVectorElementType();
32140 EVT InVT = Regs[0].getValueType();
32141 EVT InSVT = InVT.getVectorElementType();
32144 // First, use mask to unset all bits that won't appear in the result.
32145 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
32146 "OutSVT can only be either i8 or i16.");
32148 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
32149 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
32150 for (auto &Reg : Regs)
32151 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
32153 MVT UnpackedVT, PackedVT;
32154 if (OutSVT == MVT::i8) {
32155 UnpackedVT = MVT::v8i16;
32156 PackedVT = MVT::v16i8;
32158 UnpackedVT = MVT::v4i32;
32159 PackedVT = MVT::v8i16;
32162 // In each iteration, halve the element size of the type.
32163 auto RegNum = Regs.size();
32164 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
32165 j < e; j *= 2, RegNum /= 2) {
32166 for (unsigned i = 0; i < RegNum; i++)
32167 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
32168 for (unsigned i = 0; i < RegNum / 2; i++)
32169 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
32173 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS, and
32174 // then extract a subvector as the result since v8i8 is not a legal type.
32175 if (OutVT == MVT::v8i8) {
32176 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
32177 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
32178 DAG.getIntPtrConstant(0, DL));
32180 } else if (RegNum > 1) {
32181 Regs.resize(RegNum);
32182 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
32187 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
32189 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
32191 SmallVector<SDValue, 8> &Regs) {
32192 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
32193 EVT OutVT = N->getValueType(0);
32196 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
32197 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
32198 for (auto &Reg : Regs) {
32199 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
32201 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
32205 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
32206 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
32209 if (Regs.size() > 2) {
32210 Regs.resize(Regs.size() / 2);
32211 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
32216 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
32217 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
32218 /// legalization the truncation will be translated into a BUILD_VECTOR with each
32219 /// element that is extracted from a vector and then truncated, and it is
32220 /// difficult to do this optimization based on that form.
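/// For example, on plain SSE2 a v8i32 -> v8i16 truncation is split into two
/// v4i32 halves; each element is sign-extended from its low 16 bits in place
/// (shift left by 16, then arithmetic shift right by 16) and the pair is then
/// packed back into a single v8i16 with PACKSS.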
32221 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
32222 const X86Subtarget &Subtarget) {
32223 EVT OutVT = N->getValueType(0);
32224 if (!OutVT.isVector())
32227 SDValue In = N->getOperand(0);
32228 if (!In.getValueType().isSimple())
32231 EVT InVT = In.getValueType();
32232 unsigned NumElems = OutVT.getVectorNumElements();
32234 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
32235 // SSE2, and we need to take care of it specially.
32236 // AVX512 provides vpmovdb.
32237 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
32240 EVT OutSVT = OutVT.getVectorElementType();
32241 EVT InSVT = InVT.getVectorElementType();
32242 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
32243 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
32247 // SSSE3's pshufb results in fewer instructions in the cases below.
32248 if (Subtarget.hasSSSE3() && NumElems == 8 &&
32249 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
32250 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
32255 // Split a long vector into vectors of legal type.
32256 unsigned RegNum = InVT.getSizeInBits() / 128;
32257 SmallVector<SDValue, 8> SubVec(RegNum);
32258 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
32259 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
32261 for (unsigned i = 0; i < RegNum; i++)
32262 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
32263 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
32265 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
32266 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
32267 // truncate 2 x v4i32 to v8i16.
32268 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
32269 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
32270 else if (InSVT == MVT::i32)
32271 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
32276 /// This function transforms vector truncation of 'all-or-none' bit values from
32277 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
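/// For example (illustrative): truncating a v8i32 comparison result, whose lanes
/// are all-zeros or all-ones, to v8i16 can be done with a single PACKSSDW of the
/// two v4i32 halves instead of a longer shuffle sequence.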
32278 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
32280 const X86Subtarget &Subtarget) {
32281 // Requires SSE2 but AVX512 has fast truncate.
32282 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
32285 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
32288 SDValue In = N->getOperand(0);
32289 if (!In.getValueType().isSimple())
32292 MVT VT = N->getValueType(0).getSimpleVT();
32293 MVT SVT = VT.getScalarType();
32295 MVT InVT = In.getValueType().getSimpleVT();
32296 MVT InSVT = InVT.getScalarType();
32298 // Use PACKSS if the input is a splatted sign bit.
32299 // e.g. Comparison result, sext_in_reg, etc.
32300 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
32301 if (NumSignBits != InSVT.getSizeInBits())
32304 // Check we have a truncation suited for PACKSS.
32305 if (!VT.is128BitVector() && !VT.is256BitVector())
32307 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
32309 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
32312 return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
32315 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
32316 const X86Subtarget &Subtarget) {
32317 EVT VT = N->getValueType(0);
32318 SDValue Src = N->getOperand(0);
32321 // Attempt to pre-truncate inputs to arithmetic ops instead.
32322 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
32325 // Try to detect AVG pattern first.
32326 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
32329 // Try the truncation with unsigned saturation.
32330 if (SDValue Val = detectUSatPattern(Src, VT, Subtarget))
32331 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Val);
32333 // The bitcast source is a direct mmx result.
32334 // Detect bitcasts between i32 to x86mmx
32335 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
32336 SDValue BCSrc = Src.getOperand(0);
32337 if (BCSrc.getValueType() == MVT::x86mmx)
32338 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
32341 // Try to truncate extended sign bits with PACKSS.
32342 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
32345 return combineVectorTruncation(N, DAG, Subtarget);
32348 /// Returns the negated value if the node \p N flips the sign of an FP value.
32350 /// An FP-negation node may have different forms: FNEG(x) or FXOR(x, 0x80000000).
32351 /// AVX512F does not have FXOR, so FNEG is lowered as
32352 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
32354 /// In this case we go through all bitcasts.
32354 static SDValue isFNEG(SDNode *N) {
32355 if (N->getOpcode() == ISD::FNEG)
32356 return N->getOperand(0);
32358 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
32359 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
32362 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
32363 if (!Op1.getValueType().isFloatingPoint())
32366 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
32368 unsigned EltBits = Op1.getScalarValueSizeInBits();
32369 auto isSignBitValue = [&](const ConstantFP *C) {
32370 return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
32373 // There is more than one way to represent the same constant on
32374 // the different X86 targets. The type of the node may also depend on size.
32375 // - load scalar value and broadcast
32376 // - BUILD_VECTOR node
32377 // - load from a constant pool.
32378 // We check all variants here.
32379 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
32380 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
32381 if (isSignBitValue(cast<ConstantFP>(C)))
32384 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
32385 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
32386 if (isSignBitValue(CN->getConstantFPValue()))
32389 } else if (auto *C = getTargetConstantFromNode(Op1)) {
32390 if (C->getType()->isVectorTy()) {
32391 if (auto *SplatV = C->getSplatValue())
32392 if (isSignBitValue(cast<ConstantFP>(SplatV)))
32394 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
32395 if (isSignBitValue(FPConst))
32401 /// Do target-specific dag combines on floating point negations.
32402 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
32403 const X86Subtarget &Subtarget) {
32404 EVT OrigVT = N->getValueType(0);
32405 SDValue Arg = isFNEG(N);
32406 assert(Arg.getNode() && "N is expected to be an FNEG node");
32408 EVT VT = Arg.getValueType();
32409 EVT SVT = VT.getScalarType();
32412 // Let legalize expand this if it isn't a legal type yet.
32413 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32416 // If we're negating a FMUL node on a target with FMA, then we can avoid the
32417 // use of a constant by performing (-0 - A*B) instead.
32418 // FIXME: Check rounding control flags as well once it becomes available.
32419 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
32420 Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
32421 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
32422 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
32423 Arg.getOperand(1), Zero);
32424 return DAG.getBitcast(OrigVT, NewNode);
32427 // If we're negating an FMA node, then we can adjust the
32428 // instruction to include the extra negation.
32429 unsigned NewOpcode = 0;
32430 if (Arg.hasOneUse()) {
32431 switch (Arg.getOpcode()) {
32432 case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
32433 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
32434 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
32435 case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
32436 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
32437 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
32438 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
32439 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
32440 // We can't handle a scalar intrinsic node here because it would only
32441 // invert one element and not the whole vector. But we could try to handle
32442 // a negation of the lower element only.
32446 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
32447 Arg.getNode()->ops()));
32452 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
32453 const X86Subtarget &Subtarget) {
32454 MVT VT = N->getSimpleValueType(0);
32455 // If we have integer vector types available, use the integer opcodes.
32456 if (VT.isVector() && Subtarget.hasSSE2()) {
32459 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
32461 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
32462 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
32463 unsigned IntOpcode;
32464 switch (N->getOpcode()) {
32465 default: llvm_unreachable("Unexpected FP logic op");
32466 case X86ISD::FOR: IntOpcode = ISD::OR; break;
32467 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
32468 case X86ISD::FAND: IntOpcode = ISD::AND; break;
32469 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
32471 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
32472 return DAG.getBitcast(VT, IntOp);
32477 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
32478 TargetLowering::DAGCombinerInfo &DCI,
32479 const X86Subtarget &Subtarget) {
32480 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
32483 if (DCI.isBeforeLegalizeOps())
32486 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
32489 if (Subtarget.hasCMov())
32490 if (SDValue RV = combineIntegerAbs(N, DAG))
32493 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32497 return combineFneg(N, DAG, Subtarget);
32502 static bool isNullFPScalarOrVectorConst(SDValue V) {
32503 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
32506 /// If a value is a scalar FP zero or a vector FP zero (potentially including
32507 /// undefined elements), return a zero constant that may be used to fold away
32508 /// that value. In the case of a vector, the returned constant will not contain
32509 /// undefined elements even if the input parameter does. This makes it suitable
32510 /// to be used as a replacement operand with operations (eg, bitwise-and) where
32511 /// an undef should not propagate.
32512 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
32513 const X86Subtarget &Subtarget) {
32514 if (!isNullFPScalarOrVectorConst(V))
32517 if (V.getValueType().isVector())
32518 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
32523 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
32524 const X86Subtarget &Subtarget) {
32525 SDValue N0 = N->getOperand(0);
32526 SDValue N1 = N->getOperand(1);
32527 EVT VT = N->getValueType(0);
32530 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
32531 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
32532 (VT == MVT::f64 && Subtarget.hasSSE2())))
32535 auto isAllOnesConstantFP = [](SDValue V) {
32536 auto *C = dyn_cast<ConstantFPSDNode>(V);
32537 return C && C->getConstantFPValue()->isAllOnesValue();
32540 // fand (fxor X, -1), Y --> fandn X, Y
32541 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
32542 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
32544 // fand X, (fxor Y, -1) --> fandn Y, X
32545 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
32546 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
32551 /// Do target-specific dag combines on X86ISD::FAND nodes.
32552 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
32553 const X86Subtarget &Subtarget) {
32554 // FAND(0.0, x) -> 0.0
32555 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
32558 // FAND(x, 0.0) -> 0.0
32559 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
32562 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
32565 return lowerX86FPLogicOp(N, DAG, Subtarget);
32568 /// Do target-specific dag combines on X86ISD::FANDN nodes.
32569 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
32570 const X86Subtarget &Subtarget) {
32571 // FANDN(0.0, x) -> x
32572 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
32573 return N->getOperand(1);
32575 // FANDN(x, 0.0) -> 0.0
32576 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
32579 return lowerX86FPLogicOp(N, DAG, Subtarget);
32582 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
32583 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
32584 const X86Subtarget &Subtarget) {
32585 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
32587 // F[X]OR(0.0, x) -> x
32588 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
32589 return N->getOperand(1);
32591 // F[X]OR(x, 0.0) -> x
32592 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
32593 return N->getOperand(0);
32596 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
32599 return lowerX86FPLogicOp(N, DAG, Subtarget);
32602 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
32603 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
32604 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
32606 // Only perform optimizations if UnsafeMath is used.
32607 if (!DAG.getTarget().Options.UnsafeFPMath)
32610 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
32611 // into FMINC and FMAXC, which are commutative operations.
32612 unsigned NewOp = 0;
32613 switch (N->getOpcode()) {
32614 default: llvm_unreachable("unknown opcode");
32615 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
32616 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
32619 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
32620 N->getOperand(0), N->getOperand(1));
32623 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
32624 const X86Subtarget &Subtarget) {
32625 if (Subtarget.useSoftFloat())
32628 // TODO: Check for global or instruction-level "nnan". In that case, we
32629 // should be able to lower to FMAX/FMIN alone.
32630 // TODO: If an operand is already known to be a NaN or not a NaN, this
32631 // should be an optional swap and FMAX/FMIN.
32633 EVT VT = N->getValueType(0);
32634 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
32635 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
32636 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
32639 // This takes at least 3 instructions, so favor a library call when operating
32640 // on a scalar and minimizing code size.
32641 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
32644 SDValue Op0 = N->getOperand(0);
32645 SDValue Op1 = N->getOperand(1);
32647 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
32648 DAG.getDataLayout(), *DAG.getContext(), VT);
32650 // There are 4 possibilities involving NaN inputs, and these are the required
32651 // outputs:
32652 //                   Op1
32653 //               Num     NaN
32654 //            ----------------
32655 //     Num    |  Max  |  Op0 |
32656 // Op0        ----------------
32657 //     NaN    |  Op1  |  NaN |
32658 //            ----------------
32660 // The SSE FP max/min instructions were not designed for this case; they implement:
32662 // Min = Op1 < Op0 ? Op1 : Op0
32663 // Max = Op1 > Op0 ? Op1 : Op0
32665 // So they always return Op0 if either input is a NaN. However, we can still
32666 // use those instructions for fmaxnum by selecting away a NaN input.
32668 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
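// Illustrative sketch (fmaxnum case) of the nodes built below:
//   MinOrMax = FMAX(Op1, Op0)          // returns Op0 if either input is NaN
//   IsOp0Nan = SETCC(Op0, Op0, SETUO)  // true iff Op0 is NaN
//   Result   = select(IsOp0Nan, Op1, MinOrMax)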
32669 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
32670 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
32671 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
32673 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
32674 // are NaN, the NaN value of Op1 is the result.
32675 auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
32676 return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
32679 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
32680 TargetLowering::DAGCombinerInfo &DCI) {
32681 // BT ignores high bits in the bit index operand.
32682 SDValue Op1 = N->getOperand(1);
32683 if (Op1.hasOneUse()) {
32684 unsigned BitWidth = Op1.getValueSizeInBits();
32685 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
32686 APInt KnownZero, KnownOne;
32687 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
32688 !DCI.isBeforeLegalizeOps());
32689 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32690 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
32691 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
32692 DCI.CommitTargetLoweringOpt(TLO);
32697 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
32698 const X86Subtarget &Subtarget) {
32699 EVT VT = N->getValueType(0);
32700 if (!VT.isVector())
32703 SDValue N0 = N->getOperand(0);
32704 SDValue N1 = N->getOperand(1);
32705 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
32708 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
32709 // SSE and AVX2, since there is no sign-extended shift-right
32710 // operation on a vector with 64-bit elements.
32711 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
32712 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
32713 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
32714 N0.getOpcode() == ISD::SIGN_EXTEND)) {
32715 SDValue N00 = N0.getOperand(0);
32717 // EXTLOAD has a better lowering on AVX2:
32718 // it may be replaced with an X86ISD::VSEXT node.
32719 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
32720 if (!ISD::isNormalLoad(N00.getNode()))
32723 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
32724 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
32726 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
32732 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
32733 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
32734 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
32735 /// opportunities to combine math ops, use an LEA, or use a complex addressing
32736 /// mode. This can eliminate extend, add, and shift instructions.
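/// For example (illustrative), with 'x' an i32 and the result feeding a shift:
///   (shl (sext (add nsw x, 20)), 3)  -->  (shl (add (sext x), 20), 3)
/// which the addressing-mode selector can fold into a single LEA such as
/// 'leaq 160(,%rax,8), %rcx' (register choices here are only for illustration).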
32737 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
32738 const X86Subtarget &Subtarget) {
32739 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
32740 Ext->getOpcode() != ISD::ZERO_EXTEND)
32743 // TODO: This should be valid for other integer types.
32744 EVT VT = Ext->getValueType(0);
32745 if (VT != MVT::i64)
32748 SDValue Add = Ext->getOperand(0);
32749 if (Add.getOpcode() != ISD::ADD)
32752 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
32753 bool NSW = Add->getFlags()->hasNoSignedWrap();
32754 bool NUW = Add->getFlags()->hasNoUnsignedWrap();
32756 // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding into
32757 // the 'zext'.
32758 if ((Sext && !NSW) || (!Sext && !NUW))
32761 // Having a constant operand to the 'add' ensures that we are not increasing
32762 // the instruction count because the constant is extended for free below.
32763 // A constant operand can also become the displacement field of an LEA.
32764 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
32768 // Don't make the 'add' bigger if there's no hope of combining it with some
32769 // other 'add' or 'shl' instruction.
32770 // TODO: It may be profitable to generate simpler LEA instructions in place
32771 // of single 'add' instructions, but the cost model for selecting an LEA
32772 // currently has a high threshold.
32773 bool HasLEAPotential = false;
32774 for (auto *User : Ext->uses()) {
32775 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
32776 HasLEAPotential = true;
32780 if (!HasLEAPotential)
32783 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
32784 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
32785 SDValue AddOp0 = Add.getOperand(0);
32786 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
32787 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
32789 // The wider add is guaranteed to not wrap because both operands are
32790 // sign-extended or zero-extended.
32792 Flags.setNoSignedWrap(NSW);
32793 Flags.setNoUnsignedWrap(NUW);
32794 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags);
32797 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
32798 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
32799 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
32800 /// extends from AH (which we otherwise need to do contortions to access).
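/// For example (illustrative), i32 (sext (i8 (srem x, y))) can use the remainder
/// that the 8-bit divide already leaves in AH and sign-extend it from there,
/// instead of extracting it with extra shift/move instructions.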
32801 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
32802 SDValue N0 = N->getOperand(0);
32803 auto OpcodeN = N->getOpcode();
32804 auto OpcodeN0 = N0.getOpcode();
32805 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
32806 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
32809 EVT VT = N->getValueType(0);
32810 EVT InVT = N0.getValueType();
32811 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
32814 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
32815 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
32816 : X86ISD::UDIVREM8_ZEXT_HREG;
32817 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
32819 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
32820 return R.getValue(1);
32823 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
32824 /// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating with
32825 /// UNDEFs) the input into vectors of the same size as the target type, which
32826 /// then extend the lowest elements.
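/// For example (illustrative, assuming an SSE4.1 target): (v4i32 (zext (v4i16 x)))
/// is widened by concatenating x with an undef v4i16 to form a v8i16, and a
/// ZERO_EXTEND_VECTOR_INREG of that value then lowers to a single PMOVZXWD.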
32827 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
32828 TargetLowering::DAGCombinerInfo &DCI,
32829 const X86Subtarget &Subtarget) {
32830 unsigned Opcode = N->getOpcode();
32831 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
32833 if (!DCI.isBeforeLegalizeOps())
32835 if (!Subtarget.hasSSE2())
32838 SDValue N0 = N->getOperand(0);
32839 EVT VT = N->getValueType(0);
32840 EVT SVT = VT.getScalarType();
32841 EVT InVT = N0.getValueType();
32842 EVT InSVT = InVT.getScalarType();
32844 // Input type must be a vector and we must be extending legal integer types.
32845 if (!VT.isVector())
32847 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
32849 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
32852 // On AVX2+ targets, if the input/output types are both legal then we will be
32853 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
32854 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
32855 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
32860 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
32861 EVT InVT = N.getValueType();
32862 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
32863 Size / InVT.getScalarSizeInBits());
32864 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
32865 DAG.getUNDEF(InVT));
32867 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
32870 // If the target size is less than 128 bits, widen the input to a type that
32871 // extends to 128 bits, extend that, and extract the original target vector.
32872 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
32873 unsigned Scale = 128 / VT.getSizeInBits();
32875 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
32876 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
32877 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
32878 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
32879 DAG.getIntPtrConstant(0, DL));
32882 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
32883 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
32884 // Also use this if we don't have SSE41, to allow the legalizer to do its job.
32885 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
32886 (VT.is256BitVector() && Subtarget.hasInt256()) ||
32887 (VT.is512BitVector() && Subtarget.hasAVX512())) {
32888 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
32889 return Opcode == ISD::SIGN_EXTEND
32890 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
32891 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
32894 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
32895 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
32896 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
32897 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
32898 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
32900 SmallVector<SDValue, 8> Opnds;
32901 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
32902 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
32903 DAG.getIntPtrConstant(Offset, DL));
32904 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
32905 SrcVec = Opcode == ISD::SIGN_EXTEND
32906 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
32907 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
32908 Opnds.push_back(SrcVec);
32910 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
32913 // On pre-AVX2 targets, split into 128-bit nodes of
32914 // ISD::*_EXTEND_VECTOR_INREG.
32915 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
32916 return SplitAndExtendInReg(128);
32918 // On pre-AVX512 targets, split into 256-bit nodes of
32919 // ISD::*_EXTEND_VECTOR_INREG.
32920 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
32921 return SplitAndExtendInReg(256);
32926 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
32927 TargetLowering::DAGCombinerInfo &DCI,
32928 const X86Subtarget &Subtarget) {
32929 SDValue N0 = N->getOperand(0);
32930 EVT VT = N->getValueType(0);
32931 EVT InVT = N0.getValueType();
32934 if (SDValue DivRem8 = getDivRem8(N, DAG))
32937 if (!DCI.isBeforeLegalizeOps()) {
32938 if (InVT == MVT::i1) {
32939 SDValue Zero = DAG.getConstant(0, DL, VT);
32941 DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
32942 return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
32947 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
32950 if (Subtarget.hasAVX() && VT.is256BitVector())
32951 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
32954 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
32960 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
32961 const X86Subtarget &Subtarget) {
32963 EVT VT = N->getValueType(0);
32965 // Let legalize expand this if it isn't a legal type yet.
32966 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32969 EVT ScalarVT = VT.getScalarType();
32970 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
32973 SDValue A = N->getOperand(0);
32974 SDValue B = N->getOperand(1);
32975 SDValue C = N->getOperand(2);
32977 auto invertIfNegative = [](SDValue &V) {
32978 if (SDValue NegVal = isFNEG(V.getNode())) {
32985 // Do not convert the passthru input of scalar intrinsics.
32986 // FIXME: We could allow negations of the lower element only.
32987 bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
32988 bool NegB = invertIfNegative(B);
32989 bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
32991 // The multiplication is negated when NegA xor NegB is true.
32992 bool NegMul = (NegA != NegB);
32994 unsigned NewOpcode;
32996 NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
32998 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
33001 if (N->getOpcode() == X86ISD::FMADD_RND) {
33002 switch (NewOpcode) {
33003 case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
33004 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
33005 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
33006 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
33008 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
33009 switch (NewOpcode) {
33010 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
33011 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
33012 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
33013 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
33015 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
33016 switch (NewOpcode) {
33017 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
33018 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
33019 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
33020 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
33023 assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
33024 "Unexpected opcode!");
33025 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
33028 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
33031 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
33032 TargetLowering::DAGCombinerInfo &DCI,
33033 const X86Subtarget &Subtarget) {
33034 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
33035 // (and (i32 x86isd::setcc_carry), 1)
33036 // This eliminates the zext. This transformation is necessary because
33037 // ISD::SETCC is always legalized to i8.
33039 SDValue N0 = N->getOperand(0);
33040 EVT VT = N->getValueType(0);
33042 if (N0.getOpcode() == ISD::AND &&
33044 N0.getOperand(0).hasOneUse()) {
33045 SDValue N00 = N0.getOperand(0);
33046 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
33047 if (!isOneConstant(N0.getOperand(1)))
33049 return DAG.getNode(ISD::AND, dl, VT,
33050 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
33051 N00.getOperand(0), N00.getOperand(1)),
33052 DAG.getConstant(1, dl, VT));
33056 if (N0.getOpcode() == ISD::TRUNCATE &&
33058 N0.getOperand(0).hasOneUse()) {
33059 SDValue N00 = N0.getOperand(0);
33060 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
33061 return DAG.getNode(ISD::AND, dl, VT,
33062 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
33063 N00.getOperand(0), N00.getOperand(1)),
33064 DAG.getConstant(1, dl, VT));
33068 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
33071 if (VT.is256BitVector())
33072 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
33075 if (SDValue DivRem8 = getDivRem8(N, DAG))
33078 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
33081 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
33087 /// Optimize x == -y --> x+y == 0
33088 /// x != -y --> x+y != 0
33089 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
33090 const X86Subtarget &Subtarget) {
33091 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
33092 SDValue LHS = N->getOperand(0);
33093 SDValue RHS = N->getOperand(1);
33094 EVT VT = N->getValueType(0);
33097 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
33098 if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
33099 SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
33100 LHS.getOperand(1));
33101 return DAG.getSetCC(DL, N->getValueType(0), addV,
33102 DAG.getConstant(0, DL, addV.getValueType()), CC);
33104 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
33105 if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
33106 SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
33107 RHS.getOperand(1));
33108 return DAG.getSetCC(DL, N->getValueType(0), addV,
33109 DAG.getConstant(0, DL, addV.getValueType()), CC);
33112 if (VT.getScalarType() == MVT::i1 &&
33113 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
33115 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
33116 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
33117 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
33119 if (!IsSEXT0 || !IsVZero1) {
33120 // Swap the operands and update the condition code.
33121 std::swap(LHS, RHS);
33122 CC = ISD::getSetCCSwappedOperands(CC);
33124 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
33125 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
33126 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
33129 if (IsSEXT0 && IsVZero1) {
33130 assert(VT == LHS.getOperand(0).getValueType() &&
33131 "Uexpected operand type");
33132 if (CC == ISD::SETGT)
33133 return DAG.getConstant(0, DL, VT);
33134 if (CC == ISD::SETLE)
33135 return DAG.getConstant(1, DL, VT);
33136 if (CC == ISD::SETEQ || CC == ISD::SETGE)
33137 return DAG.getNOT(DL, LHS.getOperand(0), VT);
33139 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
33140 "Unexpected condition code!");
33141 return LHS.getOperand(0);
33145 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
33146 // to avoid scalarization via legalization because v4i32 is not a legal type.
33147 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
33148 LHS.getValueType() == MVT::v4f32)
33149 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
33154 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
33156 // Gather and Scatter instructions use k-registers for masks. The type of
33157 // the masks is v*i1. So the mask will be truncated anyway.
33158 // The SIGN_EXTEND_INREG may be dropped.
33159 SDValue Mask = N->getOperand(2);
33160 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
33161 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
33162 NewOps[2] = Mask.getOperand(0);
33163 DAG.UpdateNodeOperands(N, NewOps);
33168 // Helper function of combineX86SetCC. It materializes "setb reg"
33169 // as "sbb reg,reg", since it can be extended without a zext and produces
33170 // an all-ones bit which is more useful than 0/1 in some cases.
33171 static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
33172 SelectionDAG &DAG, MVT VT) {
33174 return DAG.getNode(ISD::AND, DL, VT,
33175 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
33176 DAG.getConstant(X86::COND_B, DL, MVT::i8),
33178 DAG.getConstant(1, DL, VT));
33179 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
33180 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
33181 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
33182 DAG.getConstant(X86::COND_B, DL, MVT::i8),
33186 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
33187 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
33188 TargetLowering::DAGCombinerInfo &DCI,
33189 const X86Subtarget &Subtarget) {
33191 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
33192 SDValue EFLAGS = N->getOperand(1);
33194 if (CC == X86::COND_A) {
33195 // Try to convert COND_A into COND_B in an attempt to facilitate
33196 // materializing "setb reg".
33198 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
33199 // cannot take an immediate as its first operand.
33201 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
33202 EFLAGS.getValueType().isInteger() &&
33203 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
33204 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
33205 EFLAGS.getNode()->getVTList(),
33206 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
33207 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
33208 return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
33212 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
33213 // a zext and produces an all-ones bit which is more useful than 0/1 in some
33214 // cases.
33215 if (CC == X86::COND_B)
33216 return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
33218 // Try to simplify the EFLAGS and condition code operands.
33219 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
33220 return getSETCC(CC, Flags, DL, DAG);
33225 /// Optimize branch condition evaluation.
33226 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
33227 TargetLowering::DAGCombinerInfo &DCI,
33228 const X86Subtarget &Subtarget) {
33230 SDValue EFLAGS = N->getOperand(3);
33231 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
33233 // Try to simplify the EFLAGS and condition code operands.
33234 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
33235 // RAUW them under us.
33236 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
33237 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
33238 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
33239 N->getOperand(1), Cond, Flags);
33245 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
33246 SelectionDAG &DAG) {
33247 // Take advantage of vector comparisons producing 0 or -1 in each lane to
33248 // optimize away the operation when its input is masked with a constant.
33250 // The general transformation is:
33251 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
33252 // AND(VECTOR_CMP(x,y), constant2)
33253 // constant2 = UNARYOP(constant)
33255 // Early exit if this isn't a vector operation, the operand of the
33256 // unary operation isn't a bitwise AND, or if the sizes of the operations
33257 // aren't the same.
33258 EVT VT = N->getValueType(0);
33259 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
33260 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
33261 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
33264 // Now check that the other operand of the AND is a constant. We could
33265 // make the transformation for non-constant splats as well, but it's unclear
33266 // that would be a benefit as it would not eliminate any operations, just
33267 // perform one more step in scalar code before moving to the vector unit.
33268 if (BuildVectorSDNode *BV =
33269 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
33270 // Bail out if the vector isn't a constant.
33271 if (!BV->isConstant())
33274 // Everything checks out. Build up the new and improved node.
33276 EVT IntVT = BV->getValueType(0);
33277 // Create a new constant of the appropriate type for the transformed
33278 // DAG node.
33279 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
33280 // The AND node needs bitcasts to/from an integer vector type around it.
33281 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
33282 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
33283 N->getOperand(0)->getOperand(0), MaskConst);
33284 SDValue Res = DAG.getBitcast(VT, NewAnd);
33291 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
33292 const X86Subtarget &Subtarget) {
33293 SDValue Op0 = N->getOperand(0);
33294 EVT VT = N->getValueType(0);
33295 EVT InVT = Op0.getValueType();
33296 EVT InSVT = InVT.getScalarType();
33297 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33299 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
33300 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
33301 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
33303 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
33304 InVT.getVectorNumElements());
33305 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
33307 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
33308 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
33310 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
33313 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
33314 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
33315 // the optimization here.
33316 if (DAG.SignBitIsZero(Op0))
33317 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
33322 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
33323 const X86Subtarget &Subtarget) {
33324 // First try to optimize away the conversion entirely when it's
33325 // conditionally from a constant. Vectors only.
33326 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
33329 // Now move on to more general possibilities.
33330 SDValue Op0 = N->getOperand(0);
33331 EVT VT = N->getValueType(0);
33332 EVT InVT = Op0.getValueType();
33333 EVT InSVT = InVT.getScalarType();
33335 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
33336 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
33337 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
33338 if (InVT.isVector() &&
33339 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
33340 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
33342 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
33343 InVT.getVectorNumElements());
33344 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
33345 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
33348 // Without AVX512DQ we only support i64 to float scalar conversion. For both
33349 // vectors and scalars, see if we know that the upper bits are all the sign
33350 // bit, in which case we can truncate the input to i32 and convert from that.
33351 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
33352 unsigned BitWidth = InVT.getScalarSizeInBits();
33353 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
33354 if (NumSignBits >= (BitWidth - 31)) {
33355 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
33356 if (InVT.isVector())
33357 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
33358 InVT.getVectorNumElements());
33360 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
33361 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
33365 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
33366 // a 32-bit target where SSE doesn't support i64->FP operations.
33367 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
33368 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
33369 EVT LdVT = Ld->getValueType(0);
33371 // This transformation is not supported if the result type is f16 or f128.
33372 if (VT == MVT::f16 || VT == MVT::f128)
33375 if (!Ld->isVolatile() && !VT.isVector() &&
33376 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
33377 !Subtarget.is64Bit() && LdVT == MVT::i64) {
33378 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
33379 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
33380 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
33387 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
33388 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
33389 X86TargetLowering::DAGCombinerInfo &DCI) {
33390 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
33391 // the result is either zero or one (depending on the input carry bit).
33392 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
33393 if (X86::isZeroNode(N->getOperand(0)) &&
33394 X86::isZeroNode(N->getOperand(1)) &&
33395 // We don't have a good way to replace an EFLAGS use, so only do this when
33396 // the flags result of the node is unused.
33397 SDValue(N, 1).use_empty()) {
33399 EVT VT = N->getValueType(0);
33400 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
33401 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
33402 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
33403 DAG.getConstant(X86::COND_B, DL,
33406 DAG.getConstant(1, DL, VT));
33407 return DCI.CombineTo(N, Res1, CarryOut);
33413 /// fold (add Y, (sete X, 0)) -> adc 0, Y
33414 /// (add Y, (setne X, 0)) -> sbb -1, Y
33415 /// (sub (sete X, 0), Y) -> sbb 0, Y
33416 /// (sub (setne X, 0), Y) -> adc -1, Y
33417 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
33420 // Look through ZExts.
33421 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
33422 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
33425 SDValue SetCC = Ext.getOperand(0);
33426 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
33429 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
33430 if (CC != X86::COND_E && CC != X86::COND_NE)
33433 SDValue Cmp = SetCC.getOperand(1);
33434 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
33435 !X86::isZeroNode(Cmp.getOperand(1)) ||
33436 !Cmp.getOperand(0).getValueType().isInteger())
33439 SDValue CmpOp0 = Cmp.getOperand(0);
33440 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
33441 DAG.getConstant(1, DL, CmpOp0.getValueType()));
33443 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
33444 if (CC == X86::COND_NE)
33445 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
33446 DL, OtherVal.getValueType(), OtherVal,
33447 DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
33449 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
33450 DL, OtherVal.getValueType(), OtherVal,
33451 DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
33454 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
33455 const X86Subtarget &Subtarget) {
33457 EVT VT = N->getValueType(0);
33458 SDValue Op0 = N->getOperand(0);
33459 SDValue Op1 = N->getOperand(1);
33461 // TODO: There's nothing special about i32, any integer type above i16 should
33462 // work just as well.
33463 if (!VT.isVector() || !VT.isSimple() ||
33464 !(VT.getVectorElementType() == MVT::i32))
33467 unsigned RegSize = 128;
33468 if (Subtarget.hasBWI())
33470 else if (Subtarget.hasAVX2())
33473 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
33474 // TODO: We should be able to handle larger vectors by splitting them before
33475 // feeding them into several SADs, and then reducing over those.
33476 if (VT.getSizeInBits() / 4 > RegSize)
33479 // We know N is a reduction add, which means one of its operands is a phi.
33480 // To match SAD, we need the other operand to be a vector select.
33481 SDValue SelectOp, Phi;
33482 if (Op0.getOpcode() == ISD::VSELECT) {
33485 } else if (Op1.getOpcode() == ISD::VSELECT) {
33491 // Check whether we have an abs-diff pattern feeding into the select.
33492 if(!detectZextAbsDiff(SelectOp, Op0, Op1))
33495 // SAD pattern detected. Now build a SAD instruction and an addition for
33496 // reduction. Note that the number of elements of the result of SAD is less
33497 // than the number of elements of its input. Therefore, we can only update
33498 // part of the elements in the reduction vector.
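// Illustrative example (assuming an SSE2 target): for a v16i32 reduction over
// v16i8 inputs, PSADBW produces a v2i64 value that is bitcast to v4i32 and
// added into only the low four lanes of the v16i32 reduction phi.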
33499 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
33501 // The output of PSADBW is a vector of i64.
33502 // We need to turn the vector of i64 into a vector of i32.
33503 // If the reduction vector is at least as wide as the psadbw result, just
33504 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
33505 // anyway.
33506 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
33507 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
33508 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
33510 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
33512 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
33513 // Update part of the elements of the reduction vector. This is done by first
33514 // extracting a sub-vector from it, updating this sub-vector, and inserting
33515 // it back into the full vector.
33516 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
33517 DAG.getIntPtrConstant(0, DL));
33518 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
33519 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
33520 DAG.getIntPtrConstant(0, DL));
33522 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
33525 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
33526 const X86Subtarget &Subtarget) {
33527 const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
33528 if (Flags->hasVectorReduction()) {
33529 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
33532 EVT VT = N->getValueType(0);
33533 SDValue Op0 = N->getOperand(0);
33534 SDValue Op1 = N->getOperand(1);
33536 // Try to synthesize horizontal adds from adds of shuffles.
33537 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
33538 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
33539 isHorizontalBinOp(Op0, Op1, true))
33540 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
33542 return OptimizeConditionalInDecrement(N, DAG);
33545 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
33546 const X86Subtarget &Subtarget) {
33547 SDValue Op0 = N->getOperand(0);
33548 SDValue Op1 = N->getOperand(1);
33550 // X86 can't encode an immediate LHS of a sub. See if we can push the
33551 // negation into a preceding instruction.
33552 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
33553 // If the RHS of the sub is an XOR with one use and a constant, invert the
33554 // immediate. Then add one to the LHS of the sub so we can turn
33555 // X-Y -> X+~Y+1, saving one register.
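// Illustrative example (i32): 5 - (x ^ 3) becomes (x ^ ~3) + 6; e.g. for x = 1
// both forms evaluate to 3, and the immediate LHS of the sub is gone.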
33556 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
33557 isa<ConstantSDNode>(Op1.getOperand(1))) {
33558 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
33559 EVT VT = Op0.getValueType();
33560 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
33562 DAG.getConstant(~XorC, SDLoc(Op1), VT));
33563 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
33564 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
33568 // Try to synthesize horizontal subs from subs of shuffles.
33569 EVT VT = N->getValueType(0);
33570 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
33571 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
33572 isHorizontalBinOp(Op0, Op1, true))
33573 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
33575 return OptimizeConditionalInDecrement(N, DAG);
33578 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
33579 TargetLowering::DAGCombinerInfo &DCI,
33580 const X86Subtarget &Subtarget) {
33582 unsigned Opcode = N->getOpcode();
33583 MVT VT = N->getSimpleValueType(0);
33584 MVT SVT = VT.getVectorElementType();
33585 SDValue Op = N->getOperand(0);
33586 MVT OpVT = Op.getSimpleValueType();
33587 MVT OpEltVT = OpVT.getVectorElementType();
33588 unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
33590 // Perform any constant folding.
33591 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
33592 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
33593 unsigned NumDstElts = VT.getVectorNumElements();
33594 SmallBitVector Undefs(NumDstElts, false);
33595 SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
33596 for (unsigned i = 0; i != NumDstElts; ++i) {
33597 SDValue OpElt = Op.getOperand(i);
33598 if (OpElt.getOpcode() == ISD::UNDEF) {
33602 APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
33603 Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
33604 : Cst.sextOrTrunc(SVT.getSizeInBits());
33606 return getConstVector(Vals, Undefs, VT, DAG, DL);
33609 // (vzext (bitcast (vzext x))) -> (vzext x)
33610 // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
33611 SDValue V = peekThroughBitcasts(Op);
33612 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
33613 MVT InnerVT = V.getSimpleValueType();
33614 MVT InnerEltVT = InnerVT.getVectorElementType();
33616 // If the element sizes match exactly, we can just do one larger vzext. This
33617 // is always an exact type match as vzext operates on integer types.
33618 if (OpEltVT == InnerEltVT) {
33619 assert(OpVT == InnerVT && "Types must match for vzext!");
33620 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
33623 // The only other way we can combine them is if only a single element of the
33624 // inner vzext is used in the input to the outer vzext.
33625 if (InnerEltVT.getSizeInBits() < InputBits)
33628 // In this case, the inner vzext is completely dead because we're going to
33629 // only look at bits inside of the low element. Just do the outer vzext on
33630 // a bitcast of the input to the inner.
33631 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
33634 // Check if we can bypass extracting and re-inserting an element of an input
33635 // vector. Essentially:
33636 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
33637 // TODO: Add X86ISD::VSEXT support
33638 if (Opcode == X86ISD::VZEXT &&
33639 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
33640 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
33641 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
33642 SDValue ExtractedV = V.getOperand(0);
33643 SDValue OrigV = ExtractedV.getOperand(0);
33644 if (isNullConstant(ExtractedV.getOperand(1))) {
33645 MVT OrigVT = OrigV.getSimpleValueType();
33646 // Extract a subvector if necessary...
33647 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
33648 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
33649 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
33650 OrigVT.getVectorNumElements() / Ratio);
33651 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
33652 DAG.getIntPtrConstant(0, DL));
33654 Op = DAG.getBitcast(OpVT, OrigV);
33655 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
33662 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
33663 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
33664 const X86Subtarget &Subtarget) {
33665 SDValue Chain = N->getOperand(0);
33666 SDValue LHS = N->getOperand(1);
33667 SDValue RHS = N->getOperand(2);
33668 MVT VT = RHS.getSimpleValueType();
33671 auto *C = dyn_cast<ConstantSDNode>(RHS);
33672 if (!C || C->getZExtValue() != 1)
33675 RHS = DAG.getConstant(-1, DL, VT);
33676 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
33677 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
33678 DAG.getVTList(MVT::i32, MVT::Other),
33679 {Chain, LHS, RHS}, VT, MMO);
33682 // TEST (AND a, b), (AND a, b) -> TEST a, b
33683 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
33684 SDValue Op0 = N->getOperand(0);
33685 SDValue Op1 = N->getOperand(1);
33687 if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
33690 EVT VT = N->getValueType(0);
33693 return DAG.getNode(X86ISD::TESTM, DL, VT,
33694 Op0->getOperand(0), Op0->getOperand(1));
33697 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
33698 const X86Subtarget &Subtarget) {
33699 MVT VT = N->getSimpleValueType(0);
33702 if (N->getOperand(0) == N->getOperand(1)) {
33703 if (N->getOpcode() == X86ISD::PCMPEQ)
33704 return getOnesVector(VT, Subtarget, DAG, DL);
33705 if (N->getOpcode() == X86ISD::PCMPGT)
33706 return getZeroVector(VT, Subtarget, DAG, DL);
33713 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
  case ISD::BITCAST:        return combineBitcast(N, DAG, Subtarget);
  case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
  case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
  case ISD::SUB:            return combineSub(N, DAG, Subtarget);
  case X86ISD::ADC:         return combineADC(N, DAG, DCI);
  case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
  case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
  case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
  case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
  case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
  case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
  case ISD::STORE:          return combineStore(N, DAG, Subtarget);
  case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
  case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
  case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
  case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
  case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
  case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
  case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
  case X86ISD::BT:          return combineBT(N, DAG, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG:
    return combineSignExtendInReg(N, DAG, Subtarget);
  case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
  case X86ISD::SETCC:       return combineX86SetCC(N, DAG, DCI, Subtarget);
  case X86ISD::BRCOND:      return combineBrCond(N, DAG, DCI, Subtarget);
  case X86ISD::VSHLI:
  case X86ISD::VSRLI:       return combineVectorShift(N, DAG, DCI, Subtarget);
  case X86ISD::VSEXT:
  case X86ISD::VZEXT:       return combineVSZext(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP:       // Handle all target specific shuffles
  case X86ISD::INSERTPS:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::BLENDI:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPPERM:
  case X86ISD::VPERMI:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
  case X86ISD::FMADD:
  case X86ISD::FMADD_RND:
  case X86ISD::FMADDS1_RND:
  case X86ISD::FMADDS3_RND:
  case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
  case ISD::MGATHER:
  case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
  case X86ISD::LSUB:        return combineLockSub(N, DAG, Subtarget);
  case X86ISD::TESTM:       return combineTestM(N, DAG);
  case X86ISD::PCMPEQ:
  case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
  }

  return SDValue();
}
/// Return true if the target has native support for the specified value type
/// and it is 'desirable' to use the type for the given node type. e.g. On x86
/// i16 is legal, but undesirable since i16 instruction encodings are longer
/// and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}
/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
/// we don't adjust the stack we clobber the first frame index.
/// See X86InstrInfo::copyPhysReg.
bool X86TargetLowering::hasCopyImplyingStackAdjustment(
    MachineFunction *MF) const {
  const MachineRegisterInfo &MRI = MF->getRegInfo();

  return any_of(MRI.reg_instructions(X86::EFLAGS),
                [](const MachineInstr &RI) { return RI.isCopy(); });
}
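// Background note (an assumption for illustration, not a statement from this
// file; see X86InstrInfo::copyPhysReg for the authoritative lowering): a COPY
// of EFLAGS is materialized through the stack (a PUSHF/POPF-style sequence),
// so any function containing such a copy needs the frame lowering to reserve
// a stack adjustment, which is exactly what this hook signals.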
/// This method queries the target whether it is beneficial for dag combiner to
/// promote the specified node. If true, it should return the desired promotion
/// type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    LLVM_FALLTHROUGH;
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
    break;
  }
  }

  PVT = MVT::i32;
  return Promote;
}
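// For illustration (an assumption about typical DAG-combiner behavior, not a
// guarantee made by this file): with the two hooks above, an i16 node such as
//   %r = add i16 %a, %b
// is normally promoted to an i32 add followed by a truncate, avoiding the
// 0x66 operand-size prefix and the slow i16 encodings that
// isTypeDesirableForOp describes.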
//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Helper to match a sequence of whitespace-separated pieces against the start
// of a string.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

  for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

    S = S.substr(Piece.size());
    StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return true;
}
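// Example of how the helper above behaves (derived from its definition; the
// strings are illustrative):
//   matchAsm("  bswap %eax", {"bswap", "%eax"})  -> true
//   matchAsm("bswapl%eax",   {"bswapl", "%eax"}) -> false (pieces must be
//                                                   separated by whitespace)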
// Returns true when the inline-asm clobber list names all of the flag
// registers: cc, flags, fpsr, and (when four pieces are present) dirflag.
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}
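// Example (assumed from the check above): a clobber tail of
//   "~{cc},~{dirflag},~{flags},~{fpsr}"
// split on ',' yields four pieces and satisfies the test, whereas a list that
// omits "~{fpsr}" does not.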
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  const std::string &AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w} --> llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}
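// Example of source-level inline asm this hook rewrites (the C snippet is an
// illustrative assumption; the matched pattern comes from the code above):
//   unsigned v;
//   asm("bswap $0" : "=r"(v) : "0"(v));
// is replaced by a call to llvm.bswap.i32, which then selects to a plain BSWAP
// instruction instead of an opaque inline-asm block.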
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R': case 'q': case 'Q': case 'f': case 't': case 'u':
    case 'y': case 'x': case 'v': case 'Y': case 'l':
      return C_RegisterClass;
    case 'k': // AVX512 masking registers.
    case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A':
      return C_Register;
    case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
    case 'G': case 'C': case 'e': case 'Z':
      return C_Other;
    default:
      break;
    }
  } else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default:
      break;
    case 'Y':
      switch (Constraint[1]) {
      default:
        break;
      case 'k':
        return C_Register;
      }
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    LLVM_FALLTHROUGH;
  case 'R': case 'q': case 'Q': case 'a': case 'b': case 'c': case 'd':
  case 'S': case 'D': case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f': case 't': case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget.hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'Y':
    // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
    if (constraint[1] == 'k') {
      // Support for 'Yk' (similarly to the 'k' variant below).
      weight = CW_SpecificReg;
      break;
    }
    // Else fall through (handle "Y" constraint).
    LLVM_FALLTHROUGH;
  case 'v':
    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
      weight = CW_Register;
    LLVM_FALLTHROUGH;
  case 'x':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
      weight = CW_Register;
    break;
  case 'k':
    // Enable conditional vector operations using %k<#> registers.
    weight = CW_SpecificReg;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}
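// Example of the resulting weights (assumed from the cases above): for the
// constraint "I", a call operand of 'i32 7' scores CW_Constant, while 'i32 99'
// falls outside the 0..31 range and keeps the default CW_Invalid weight.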
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
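// Illustrative behavior (assumed from the cases above): lowering constraint
// 'I' with a constant operand of 7 pushes a target constant onto Ops, while a
// value of 42 leaves Ops untouched and the operand is rejected.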
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}

/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default: break;
    case 'k':
      // This register class doesn't allocate k0 for masked vector operation.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // Map {flags} to the EFLAGS register.
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (Res.second->hasType(VT) || VT == MVT::Other)
    return Res;   // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      Res.first = DestReg;
      Res.second = Size == 8 ? &X86::GR8RegClass
                 : Size == 16 ? &X86::GR16RegClass
                 : Size == 32 ? &X86::GR32RegClass
                 : &X86::GR64RegClass;
      assert(Res.second->contains(Res.first) && "Register in register class");
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
    else if (X86::VR512RegClass.hasType(VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: Return an error;
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
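// Illustrative examples (assumed from the logic above): the single-letter
// constraint "r" with MVT::i32 yields GR32RegClass, while an explicit "{ax}"
// operand of type i32 is re-mapped to EAX in GR32RegClass rather than being
// left as the 16-bit AX register.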
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
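// Worked example (assumed from the return value above): a legal mode with
// HasBaseReg=true and Scale=2 costs 1, since the scaled index consumes a
// second register; a plain base-register mode with Scale=0 costs 0; and an
// illegal addressing mode yields a negative cost.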
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
                                   Attribute::MinSize);
  return OptSize && !VT.isVector();
}
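// Consequence (assumed, for illustration): when a function is marked minsize,
// a scalar 'udiv i32 %x, 10' is kept as a DIV instruction, whereas without the
// attribute the division by a constant is expanded into a larger but faster
// multiply-by-magic-constant sequence.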
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction()->hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}